fix: Pipes in math and code blocks within table cells

2026-06-13 11:25:03 +03:00 · 2026-05-28 21:42:08 -04:00
parent ae5cd6a159
commit d581de55f7
4 changed files with 614 additions and 8 deletions
@@ -2,7 +2,7 @@ import crypto from "node:crypto";
 import { Node } from "prosemirror-model";
 import { ProsemirrorHelper as ServerProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper";
 import { ProsemirrorHelper } from "@shared/utils/ProsemirrorHelper";
-import { schema } from "@server/editor";
+import { parser, schema, serializer } from "@server/editor";

 // Note: The test is here rather than shared to access the schema
 describe("#ProsemirrorHelper", () => {
@@ -184,6 +184,115 @@ describe("#ProsemirrorHelper", () => {
    });
  });

+  describe("table markdown round trip", () => {
+    const roundTrip = (md: string) => {
+      const doc = parser.parse(md);
+      expect(doc).not.toBeNull();
+      const first = serializer.serialize(doc!);
+      const second = serializer.serialize(parser.parse(first)!);
+      return { first, second };
+    };
+
+    const getCellTexts = (md: string) => {
+      const doc = parser.parse(md)!;
+      const table = doc.content.firstChild!;
+      expect(table.type.name).toBe("table");
+      const rows: string[][] = [];
+      table.forEach((row) => {
+        const cells: string[] = [];
+        row.forEach((cell) => cells.push(cell.textContent));
+        rows.push(cells);
+      });
+      return rows;
+    };
+
+    it("preserves a single inline code span containing pipes", () => {
+      const cells = getCellTexts(
+        ["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n")
+      );
+
+      expect(cells).toEqual([
+        ["A", "B"],
+        ["x", "|y|"],
+      ]);
+    });
+
+    it("preserves multiple inline code spans with pipes in the same cell", () => {
+      const md = [
+        "| Condition | Facts |",
+        "| --- | --- |",
+        "| Absolute time difference | The system checks `|Clock_NTP_Camera1 - Clock_GPS_Camera1|` and `|Clock_NTP_Camera2 - Clock_GPS_Camera2|`. |",
+        "",
+      ].join("\n");
+
+      const cells = getCellTexts(md);
+      expect(cells).toHaveLength(2);
+      expect(cells[1][0]).toBe("Absolute time difference");
+      expect(cells[1][1]).toBe(
+        "The system checks |Clock_NTP_Camera1 - Clock_GPS_Camera1| and |Clock_NTP_Camera2 - Clock_GPS_Camera2|."
+      );
+    });
+
+    it("preserves inline math containing pipes", () => {
+      const cells = getCellTexts(
+        ["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join("\n")
+      );
+
+      expect(cells[1][0]).toBe("x");
+      expect(cells[1][1]).toBe("|a-b|");
+    });
+
+    it("preserves identifiers with underscores and braces inside code spans", () => {
+      const cells = getCellTexts(
+        [
+          "| Field | Value |",
+          "| --- | --- |",
+          "| ID | `foo_{bar}|baz_{qux}` |",
+          "",
+        ].join("\n")
+      );
+
+      expect(cells[1][1]).toBe("foo_{bar}|baz_{qux}");
+    });
+
+    it("re-serializes a table with code-span pipes idempotently", () => {
+      const { first, second } = roundTrip(
+        ["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n")
+      );
+
+      expect(second).toBe(first);
+    });
+
+    it("re-serializes a table with prose plus code-span pipes idempotently", () => {
+      const { first, second } = roundTrip(
+        [
+          "| Condition | Facts |",
+          "| --- | --- |",
+          "| Absolute time difference | The system checks `|Clock_NTP - Clock_GPS|`. |",
+          "",
+        ].join("\n")
+      );
+
+      expect(second).toBe(first);
+    });
+
+    it("re-serializes a table with inline math pipes idempotently", () => {
+      const { first, second } = roundTrip(
+        ["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join("\n")
+      );
+
+      expect(second).toBe(first);
+    });
+
+    it("still splits cells on unescaped pipes outside code spans", () => {
+      const cells = getCellTexts(
+        ["| A | B | C |", "| --- | --- | --- |", "| x | y | z |", ""].join("\n")
+      );
+
+      expect(cells[1]).toEqual(["x", "y", "z"]);
+    });
+  });
+
  describe("#removeMarks", () => {
    it("preserves table cell background color when removing comment marks", () => {
      const doc = Node.fromJSON(schema, {
@@ -345,16 +345,13 @@ export class MarkdownSerializerState {
          this.text(this.markString(add, true, parent, index), false);
        }

-        // Render the node. Special case code marks, since their content is not
-        // escaped, apart from pipes in tables.
+        // Render the node. Special case code marks, since their content is
+        // not escaped. Pipes inside code spans are safe inside tables because
+        // the block table rule recognises code spans when splitting cells.
        if (noEsc && node.isText) {
-          const text = this.inTable
-            ? node.text.replace(/\|/gi, "\\$&")
-            : node.text;
-
          this.text(
            this.markString(inner, true, parent, index) +
-              text +
+              node.text +
              this.markString(inner, false, parent, index + 1),
            false
          );
@@ -0,0 +1,134 @@
+import markdownit from "markdown-it";
+import tablesRule from "./tables";
+
+type Cell = { content: string };
+
+function parseRows(md: string): Cell[][] {
+  const parser = markdownit("default").use(tablesRule);
+  const tokens = parser.parse(md, {});
+
+  const rows: Cell[][] = [];
+  let row: Cell[] | null = null;
+
+  for (const token of tokens) {
+    if (token.type === "tr_open") {
+      row = [];
+    } else if (token.type === "tr_close" && row) {
+      rows.push(row);
+      row = null;
+    } else if ((token.type === "th_open" || token.type === "td_open") && row) {
+      // The next inline token holds the cell text.
+      row.push({ content: "" });
+    } else if (token.type === "inline" && row && row.length > 0) {
+      row[row.length - 1].content = token.content;
+    }
+  }
+
+  return rows;
+}
+
+describe("table rule", () => {
+  describe("pipes inside code spans", () => {
+    it("does not split a cell on pipes inside a backtick code span", () => {
+      const md = ["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n");
+
+      const rows = parseRows(md);
+
+      expect(rows).toHaveLength(2);
+      expect(rows[0].map((c) => c.content)).toEqual(["A", "B"]);
+      expect(rows[1].map((c) => c.content)).toEqual(["x", "`|y|`"]);
+    });
+
+    it("preserves multiple code-spanned expressions with pipes in one cell", () => {
+      const md = [
+        "| Condition | Facts |",
+        "| --- | --- |",
+        "| Absolute time difference | The system checks `|Clock_NTP - Clock_GPS|` and `|Clock_NTP2 - Clock_GPS2|`. |",
+        "",
+      ].join("\n");
+
+      const rows = parseRows(md);
+
+      expect(rows).toHaveLength(2);
+      expect(rows[1]).toHaveLength(2);
+      expect(rows[1][0].content).toBe("Absolute time difference");
+      expect(rows[1][1].content).toBe(
+        "The system checks `|Clock_NTP - Clock_GPS|` and `|Clock_NTP2 - Clock_GPS2|`."
+      );
+    });
+
+    it("handles double-backtick code spans containing single backticks and pipes", () => {
+      const md = ["| A | B |", "| --- | --- |", "| x | ``a|b`c`` |", ""].join(
+        "\n"
+      );
+
+      const rows = parseRows(md);
+
+      expect(rows[1].map((c) => c.content)).toEqual(["x", "``a|b`c``"]);
+    });
+
+    it("still treats unfenced pipes as cell separators", () => {
+      const md = [
+        "| A | B | C |",
+        "| --- | --- | --- |",
+        "| x | y | z |",
+        "",
+      ].join("\n");
+
+      const rows = parseRows(md);
+
+      expect(rows[1].map((c) => c.content)).toEqual(["x", "y", "z"]);
+    });
+
+    it("treats backslash-escaped pipes inside code spans as literal", () => {
+      const md = ["| A | B |", "| --- | --- |", "| x | `\\|y\\|` |", ""].join(
+        "\n"
+      );
+
+      const rows = parseRows(md);
+
+      // Backslash escapes are not processed inside code spans, but the
+      // pipes are inside the code span so they must not split the cell.
+      expect(rows[1]).toHaveLength(2);
+      expect(rows[1][0].content).toBe("x");
+      // The cell should still contain the original code-span text (the
+      // exact serialized form is not important here, but it must not be
+      // truncated).
+      expect(rows[1][1].content).toContain("y");
+      expect(rows[1][1].content).toContain("|");
+    });
+  });
+
+  describe("pipes inside inline math", () => {
+    it("does not split a cell on pipes inside $...$ math", () => {
+      const md = ["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join(
+        "\n"
+      );
+
+      const rows = parseRows(md);
+
+      expect(rows[1]).toHaveLength(2);
+      expect(rows[1][1].content).toBe("$|a-b|$");
+    });
+
+    it("does not treat lone dollar signs as math openers", () => {
+      const md = ["| A | B |", "| --- | --- |", "| $5 | $10 |", ""].join("\n");
+
+      const rows = parseRows(md);
+
+      // Lone dollar amounts shouldn't open math spans and merge cells.
+      expect(rows[1].map((c) => c.content)).toEqual(["$5", "$10"]);
+    });
+  });
+
+  describe("backslash escapes outside code spans", () => {
+    it("still honors \\| as an escaped cell-pipe", () => {
+      const md = ["| A | B |", "| --- | --- |", "| x | a\\|b |", ""].join("\n");
+
+      const rows = parseRows(md);
+
+      expect(rows[1]).toHaveLength(2);
+      expect(rows[1][1].content).toContain("|");
+    });
+  });
+});
@@ -1,4 +1,5 @@
 import type MarkdownIt from "markdown-it";
+import type StateBlock from "markdown-it/lib/rules_block/state_block.mjs";

 const BREAK_REGEX = /(?<=^|[^\\])\\n/;
 const BR_TAG_REGEX = /<br\s*\/?>/gi;
@@ -6,7 +7,372 @@ const BR_TAG_REGEX = /<br\s*\/?>/gi;
 // Stops at <br> or newline to handle multiple checkboxes in a cell
 const CHECKBOX_REGEX = /^(?:-\s*)?\[(X|\s|_|-)\]\s([^<\n]*)?/i;

+const TAB = 0x09;
+const SPACE = 0x20;
+const DOLLAR = 0x24;
+const BACKSLASH = 0x5c;
+const BACKTICK = 0x60;
+const PIPE = 0x7c;
+const HYPHEN = 0x2d;
+const COLON = 0x3a;
+const DIGIT_0 = 0x30;
+const DIGIT_9 = 0x39;
+const MAX_AUTOCOMPLETED_CELLS = 0x10000;
+
+function isSpace(code: number): boolean {
+  return code === SPACE || code === TAB;
+}
+
+/**
+ * Skip over a backtick-delimited code span starting at `pos`.
+ *
+ * @returns the position just past the closing run if a balanced span exists,
+ * otherwise -1 (indicating the backticks at `pos` are literal text).
+ */
+function skipCodeSpan(str: string, pos: number): number {
+  const max = str.length;
+  const runStart = pos;
+  while (pos < max && str.charCodeAt(pos) === BACKTICK) {
+    pos++;
+  }
+  const tickCount = pos - runStart;
+
+  let scan = pos;
+  while (scan < max) {
+    if (str.charCodeAt(scan) !== BACKTICK) {
+      scan++;
+      continue;
+    }
+    const closeStart = scan;
+    while (scan < max && str.charCodeAt(scan) === BACKTICK) {
+      scan++;
+    }
+    if (scan - closeStart === tickCount) {
+      return scan;
+    }
+  }
+  return -1;
+}
+
+/**
+ * Skip over an inline $...$ math span starting at `pos`.
+ *
+ * Mirrors the opener/closer constraints used by the math inline rule so that
+ * literal dollar amounts (e.g. "$5") do not accidentally consume a cell.
+ *
+ * @returns the position just past the closing `$` if a balanced span exists,
+ * otherwise -1.
+ */
+function skipMathSpan(str: string, pos: number): number {
+  const max = str.length;
+  // Opener: next char must not be whitespace.
+  const next = pos + 1 < max ? str.charCodeAt(pos + 1) : -1;
+  if (next === -1 || isSpace(next)) {
+    return -1;
+  }
+
+  let scan = pos + 1;
+  while (scan < max) {
+    const ch = str.charCodeAt(scan);
+    if (ch === BACKSLASH) {
+      // Skip escaped char.
+      scan += 2;
+      continue;
+    }
+    if (ch === DOLLAR) {
+      const prev = str.charCodeAt(scan - 1);
+      const after = scan + 1 < max ? str.charCodeAt(scan + 1) : -1;
+      // Closer: prev not whitespace, next not a digit.
+      if (!isSpace(prev) && !(after >= DIGIT_0 && after <= DIGIT_9)) {
+        return scan + 1;
+      }
+    }
+    scan++;
+  }
+  return -1;
+}
+
+/**
+ * Split a table row on unescaped pipes, while ignoring pipes that fall inside
+ * inline code spans or inline math spans. This is a superset of markdown-it's
+ * default behaviour, which only honours `\|` escapes and so silently truncates
+ * cells whose content contains literal `|` characters inside `` `...` `` or
+ * `$...$`.
+ */
+function escapedSplit(str: string): string[] {
+  const result: string[] = [];
+  const max = str.length;
+
+  let pos = 0;
+  let isEscaped = false;
+  let lastPos = 0;
+  let current = "";
+
+  while (pos < max) {
+    const ch = str.charCodeAt(pos);
+
+    if (!isEscaped && ch === BACKTICK) {
+      const end = skipCodeSpan(str, pos);
+      if (end !== -1) {
+        pos = end;
+        isEscaped = false;
+        continue;
+      }
+    } else if (!isEscaped && ch === DOLLAR) {
+      const end = skipMathSpan(str, pos);
+      if (end !== -1) {
+        pos = end;
+        isEscaped = false;
+        continue;
+      }
+    }
+
+    if (ch === PIPE) {
+      if (!isEscaped) {
+        result.push(current + str.substring(lastPos, pos));
+        current = "";
+        lastPos = pos + 1;
+      } else {
+        // Escaped pipe `\|` becomes part of the current cell, dropping the
+        // leading backslash.
+        current += str.substring(lastPos, pos - 1);
+        lastPos = pos;
+      }
+    }
+
+    isEscaped = ch === BACKSLASH;
+    pos++;
+  }
+
+  result.push(current + str.substring(lastPos));
+  return result;
+}
+
+function getLine(state: StateBlock, line: number): string {
+  const pos = state.bMarks[line] + state.tShift[line];
+  const max = state.eMarks[line];
+  return state.src.slice(pos, max);
+}
+
+/**
+ * GFM table block rule, forked from markdown-it. The only behavioural change
+ * from the upstream rule is that {@link escapedSplit} also recognises
+ * backtick-delimited code spans and `$...$` math spans, so pipes inside such
+ * spans no longer split the row.
+ */
+function tableRule(
+  state: StateBlock,
+  startLine: number,
+  endLine: number,
+  silent: boolean
+): boolean {
+  if (startLine + 2 > endLine) {
+    return false;
+  }
+
+  let nextLine = startLine + 1;
+  if (state.sCount[nextLine] < state.blkIndent) {
+    return false;
+  }
+  if (state.sCount[nextLine] - state.blkIndent >= 4) {
+    return false;
+  }
+
+  let pos = state.bMarks[nextLine] + state.tShift[nextLine];
+  if (pos >= state.eMarks[nextLine]) {
+    return false;
+  }
+
+  const firstCh = state.src.charCodeAt(pos++);
+  if (firstCh !== PIPE && firstCh !== HYPHEN && firstCh !== COLON) {
+    return false;
+  }
+  if (pos >= state.eMarks[nextLine]) {
+    return false;
+  }
+
+  const secondCh = state.src.charCodeAt(pos++);
+  if (
+    secondCh !== PIPE &&
+    secondCh !== HYPHEN &&
+    secondCh !== COLON &&
+    !isSpace(secondCh)
+  ) {
+    return false;
+  }
+  if (firstCh === HYPHEN && isSpace(secondCh)) {
+    return false;
+  }
+
+  while (pos < state.eMarks[nextLine]) {
+    const ch = state.src.charCodeAt(pos);
+    if (ch !== PIPE && ch !== HYPHEN && ch !== COLON && !isSpace(ch)) {
+      return false;
+    }
+    pos++;
+  }
+
+  let lineText = getLine(state, startLine + 1);
+  let columns = lineText.split("|");
+  const aligns: string[] = [];
+  for (let i = 0; i < columns.length; i++) {
+    const t = columns[i].trim();
+    if (!t) {
+      if (i === 0 || i === columns.length - 1) {
+        continue;
+      } else {
+        return false;
+      }
+    }
+    if (!/^:?-+:?$/.test(t)) {
+      return false;
+    }
+    if (t.charCodeAt(t.length - 1) === COLON) {
+      aligns.push(t.charCodeAt(0) === COLON ? "center" : "right");
+    } else if (t.charCodeAt(0) === COLON) {
+      aligns.push("left");
+    } else {
+      aligns.push("");
+    }
+  }
+
+  lineText = getLine(state, startLine).trim();
+  if (lineText.indexOf("|") === -1) {
+    return false;
+  }
+  if (state.sCount[startLine] - state.blkIndent >= 4) {
+    return false;
+  }
+  columns = escapedSplit(lineText);
+  if (columns.length && columns[0] === "") {
+    columns.shift();
+  }
+  if (columns.length && columns[columns.length - 1] === "") {
+    columns.pop();
+  }
+
+  const columnCount = columns.length;
+  if (columnCount === 0 || columnCount !== aligns.length) {
+    return false;
+  }
+
+  if (silent) {
+    return true;
+  }
+
+  const oldParentType = state.parentType;
+  // The StateBlock type only knows the standard parent-type strings. Casting
+  // keeps us in lockstep with the upstream rule.
+  state.parentType = "table" as typeof state.parentType;
+
+  const terminatorRules = state.md.block.ruler.getRules("blockquote");
+
+  const tableTokenOpen = state.push("table_open", "table", 1);
+  const tableLines: [number, number] = [startLine, 0];
+  tableTokenOpen.map = tableLines;
+
+  const tHeadOpen = state.push("thead_open", "thead", 1);
+  tHeadOpen.map = [startLine, startLine + 1];
+
+  const headerRowOpen = state.push("tr_open", "tr", 1);
+  headerRowOpen.map = [startLine, startLine + 1];
+
+  for (let i = 0; i < columns.length; i++) {
+    const thOpen = state.push("th_open", "th", 1);
+    if (aligns[i]) {
+      thOpen.attrs = [["style", "text-align:" + aligns[i]]];
+    }
+    const inlineToken = state.push("inline", "", 0);
+    inlineToken.content = columns[i].trim();
+    inlineToken.children = [];
+    state.push("th_close", "th", -1);
+  }
+
+  state.push("tr_close", "tr", -1);
+  state.push("thead_close", "thead", -1);
+
+  let tBodyLines: [number, number] | undefined;
+  let autocompletedCells = 0;
+
+  for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
+    if (state.sCount[nextLine] < state.blkIndent) {
+      break;
+    }
+
+    let terminate = false;
+    for (let i = 0, l = terminatorRules.length; i < l; i++) {
+      if (terminatorRules[i](state, nextLine, endLine, true)) {
+        terminate = true;
+        break;
+      }
+    }
+    if (terminate) {
+      break;
+    }
+
+    lineText = getLine(state, nextLine).trim();
+    if (!lineText) {
+      break;
+    }
+    if (state.sCount[nextLine] - state.blkIndent >= 4) {
+      break;
+    }
+
+    columns = escapedSplit(lineText);
+    if (columns.length && columns[0] === "") {
+      columns.shift();
+    }
+    if (columns.length && columns[columns.length - 1] === "") {
+      columns.pop();
+    }
+
+    autocompletedCells += columnCount - columns.length;
+    if (autocompletedCells > MAX_AUTOCOMPLETED_CELLS) {
+      break;
+    }
+
+    if (nextLine === startLine + 2) {
+      const tBodyOpen = state.push("tbody_open", "tbody", 1);
+      tBodyLines = [startLine + 2, 0];
+      tBodyOpen.map = tBodyLines;
+    }
+
+    const rowOpen = state.push("tr_open", "tr", 1);
+    rowOpen.map = [nextLine, nextLine + 1];
+
+    for (let i = 0; i < columnCount; i++) {
+      const tdOpen = state.push("td_open", "td", 1);
+      if (aligns[i]) {
+        tdOpen.attrs = [["style", "text-align:" + aligns[i]]];
+      }
+      const inlineToken = state.push("inline", "", 0);
+      inlineToken.content = columns[i] ? columns[i].trim() : "";
+      inlineToken.children = [];
+      state.push("td_close", "td", -1);
+    }
+    state.push("tr_close", "tr", -1);
+  }
+
+  if (tBodyLines) {
+    state.push("tbody_close", "tbody", -1);
+    tBodyLines[1] = nextLine;
+  }
+
+  state.push("table_close", "table", -1);
+  tableLines[1] = nextLine;
+
+  state.parentType = oldParentType;
+  state.line = nextLine;
+  return true;
+}
+
 export default function markdownTables(md: MarkdownIt): void {
+  // Replace the built-in GFM table rule with one that ignores pipes inside
+  // inline code and math spans when splitting cells. Without this, content
+  // such as `` `|a-b|` `` would silently split a row into extra cells and
+  // truncate the document on round-trip.
+  md.block.ruler.at("table", tableRule, { alt: ["paragraph", "reference"] });
+
  // insert a new rule after the "inline" rules are parsed
  md.core.ruler.after("inline", "tables-pm", (state) => {
    const tokens = state.tokens;