From d581de55f7d67d2299ee8fd9cadf0c621fca4657 Mon Sep 17 00:00:00 2001 From: Tom Moor Date: Thu, 28 May 2026 21:42:08 -0400 Subject: [PATCH] fix: Pipes in math and code blocks within table cells --- server/utils/ProsemirrorHelper.test.ts | 111 ++++++- shared/editor/lib/markdown/serializer.ts | 11 +- shared/editor/rules/tables.test.ts | 134 +++++++++ shared/editor/rules/tables.ts | 366 +++++++++++++++++++++++ 4 files changed, 614 insertions(+), 8 deletions(-) create mode 100644 shared/editor/rules/tables.test.ts diff --git a/server/utils/ProsemirrorHelper.test.ts b/server/utils/ProsemirrorHelper.test.ts index 36a20bc5a2..87402ca4ad 100644 --- a/server/utils/ProsemirrorHelper.test.ts +++ b/server/utils/ProsemirrorHelper.test.ts @@ -2,7 +2,7 @@ import crypto from "node:crypto"; import { Node } from "prosemirror-model"; import { ProsemirrorHelper as ServerProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper"; import { ProsemirrorHelper } from "@shared/utils/ProsemirrorHelper"; -import { schema } from "@server/editor"; +import { parser, schema, serializer } from "@server/editor"; // Note: The test is here rather than shared to access the schema describe("#ProsemirrorHelper", () => { @@ -184,6 +184,115 @@ describe("#ProsemirrorHelper", () => { }); }); + describe("table markdown round trip", () => { + const roundTrip = (md: string) => { + const doc = parser.parse(md); + expect(doc).not.toBeNull(); + const first = serializer.serialize(doc!); + const second = serializer.serialize(parser.parse(first)!); + return { first, second }; + }; + + const getCellTexts = (md: string) => { + const doc = parser.parse(md)!; + const table = doc.content.firstChild!; + expect(table.type.name).toBe("table"); + const rows: string[][] = []; + table.forEach((row) => { + const cells: string[] = []; + row.forEach((cell) => cells.push(cell.textContent)); + rows.push(cells); + }); + return rows; + }; + + it("preserves a single inline code span containing pipes", () => { + const cells = getCellTexts( + ["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n") + ); + + expect(cells).toEqual([ + ["A", "B"], + ["x", "|y|"], + ]); + }); + + it("preserves multiple inline code spans with pipes in the same cell", () => { + const md = [ + "| Condition | Facts |", + "| --- | --- |", + "| Absolute time difference | The system checks `|Clock_NTP_Camera1 - Clock_GPS_Camera1|` and `|Clock_NTP_Camera2 - Clock_GPS_Camera2|`. |", + "", + ].join("\n"); + + const cells = getCellTexts(md); + expect(cells).toHaveLength(2); + expect(cells[1][0]).toBe("Absolute time difference"); + expect(cells[1][1]).toBe( + "The system checks |Clock_NTP_Camera1 - Clock_GPS_Camera1| and |Clock_NTP_Camera2 - Clock_GPS_Camera2|." + ); + }); + + it("preserves inline math containing pipes", () => { + const cells = getCellTexts( + ["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join("\n") + ); + + expect(cells[1][0]).toBe("x"); + expect(cells[1][1]).toBe("|a-b|"); + }); + + it("preserves identifiers with underscores and braces inside code spans", () => { + const cells = getCellTexts( + [ + "| Field | Value |", + "| --- | --- |", + "| ID | `foo_{bar}|baz_{qux}` |", + "", + ].join("\n") + ); + + expect(cells[1][1]).toBe("foo_{bar}|baz_{qux}"); + }); + + it("re-serializes a table with code-span pipes idempotently", () => { + const { first, second } = roundTrip( + ["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n") + ); + + expect(second).toBe(first); + }); + + it("re-serializes a table with prose plus code-span pipes idempotently", () => { + const { first, second } = roundTrip( + [ + "| Condition | Facts |", + "| --- | --- |", + "| Absolute time difference | The system checks `|Clock_NTP - Clock_GPS|`. |", + "", + ].join("\n") + ); + + expect(second).toBe(first); + }); + + it("re-serializes a table with inline math pipes idempotently", () => { + const { first, second } = roundTrip( + ["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join("\n") + ); + + expect(second).toBe(first); + }); + + it("still splits cells on unescaped pipes outside code spans", () => { + const cells = getCellTexts( + ["| A | B | C |", "| --- | --- | --- |", "| x | y | z |", ""].join("\n") + ); + + expect(cells[1]).toEqual(["x", "y", "z"]); + }); + }); + describe("#removeMarks", () => { it("preserves table cell background color when removing comment marks", () => { const doc = Node.fromJSON(schema, { diff --git a/shared/editor/lib/markdown/serializer.ts b/shared/editor/lib/markdown/serializer.ts index 85a1d82ad3..73be98853a 100644 --- a/shared/editor/lib/markdown/serializer.ts +++ b/shared/editor/lib/markdown/serializer.ts @@ -345,16 +345,13 @@ export class MarkdownSerializerState { this.text(this.markString(add, true, parent, index), false); } - // Render the node. Special case code marks, since their content is not - // escaped, apart from pipes in tables. + // Render the node. Special case code marks, since their content is + // not escaped. Pipes inside code spans are safe inside tables because + // the block table rule recognises code spans when splitting cells. if (noEsc && node.isText) { - const text = this.inTable - ? node.text.replace(/\|/gi, "\\$&") - : node.text; - this.text( this.markString(inner, true, parent, index) + - text + + node.text + this.markString(inner, false, parent, index + 1), false ); diff --git a/shared/editor/rules/tables.test.ts b/shared/editor/rules/tables.test.ts new file mode 100644 index 0000000000..482f8e6bb5 --- /dev/null +++ b/shared/editor/rules/tables.test.ts @@ -0,0 +1,134 @@ +import markdownit from "markdown-it"; +import tablesRule from "./tables"; + +type Cell = { content: string }; + +function parseRows(md: string): Cell[][] { + const parser = markdownit("default").use(tablesRule); + const tokens = parser.parse(md, {}); + + const rows: Cell[][] = []; + let row: Cell[] | null = null; + + for (const token of tokens) { + if (token.type === "tr_open") { + row = []; + } else if (token.type === "tr_close" && row) { + rows.push(row); + row = null; + } else if ((token.type === "th_open" || token.type === "td_open") && row) { + // The next inline token holds the cell text. + row.push({ content: "" }); + } else if (token.type === "inline" && row && row.length > 0) { + row[row.length - 1].content = token.content; + } + } + + return rows; +} + +describe("table rule", () => { + describe("pipes inside code spans", () => { + it("does not split a cell on pipes inside a backtick code span", () => { + const md = ["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n"); + + const rows = parseRows(md); + + expect(rows).toHaveLength(2); + expect(rows[0].map((c) => c.content)).toEqual(["A", "B"]); + expect(rows[1].map((c) => c.content)).toEqual(["x", "`|y|`"]); + }); + + it("preserves multiple code-spanned expressions with pipes in one cell", () => { + const md = [ + "| Condition | Facts |", + "| --- | --- |", + "| Absolute time difference | The system checks `|Clock_NTP - Clock_GPS|` and `|Clock_NTP2 - Clock_GPS2|`. |", + "", + ].join("\n"); + + const rows = parseRows(md); + + expect(rows).toHaveLength(2); + expect(rows[1]).toHaveLength(2); + expect(rows[1][0].content).toBe("Absolute time difference"); + expect(rows[1][1].content).toBe( + "The system checks `|Clock_NTP - Clock_GPS|` and `|Clock_NTP2 - Clock_GPS2|`." + ); + }); + + it("handles double-backtick code spans containing single backticks and pipes", () => { + const md = ["| A | B |", "| --- | --- |", "| x | ``a|b`c`` |", ""].join( + "\n" + ); + + const rows = parseRows(md); + + expect(rows[1].map((c) => c.content)).toEqual(["x", "``a|b`c``"]); + }); + + it("still treats unfenced pipes as cell separators", () => { + const md = [ + "| A | B | C |", + "| --- | --- | --- |", + "| x | y | z |", + "", + ].join("\n"); + + const rows = parseRows(md); + + expect(rows[1].map((c) => c.content)).toEqual(["x", "y", "z"]); + }); + + it("treats backslash-escaped pipes inside code spans as literal", () => { + const md = ["| A | B |", "| --- | --- |", "| x | `\\|y\\|` |", ""].join( + "\n" + ); + + const rows = parseRows(md); + + // Backslash escapes are not processed inside code spans, but the + // pipes are inside the code span so they must not split the cell. + expect(rows[1]).toHaveLength(2); + expect(rows[1][0].content).toBe("x"); + // The cell should still contain the original code-span text (the + // exact serialized form is not important here, but it must not be + // truncated). + expect(rows[1][1].content).toContain("y"); + expect(rows[1][1].content).toContain("|"); + }); + }); + + describe("pipes inside inline math", () => { + it("does not split a cell on pipes inside $...$ math", () => { + const md = ["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join( + "\n" + ); + + const rows = parseRows(md); + + expect(rows[1]).toHaveLength(2); + expect(rows[1][1].content).toBe("$|a-b|$"); + }); + + it("does not treat lone dollar signs as math openers", () => { + const md = ["| A | B |", "| --- | --- |", "| $5 | $10 |", ""].join("\n"); + + const rows = parseRows(md); + + // Lone dollar amounts shouldn't open math spans and merge cells. + expect(rows[1].map((c) => c.content)).toEqual(["$5", "$10"]); + }); + }); + + describe("backslash escapes outside code spans", () => { + it("still honors \\| as an escaped cell-pipe", () => { + const md = ["| A | B |", "| --- | --- |", "| x | a\\|b |", ""].join("\n"); + + const rows = parseRows(md); + + expect(rows[1]).toHaveLength(2); + expect(rows[1][1].content).toContain("|"); + }); + }); +}); diff --git a/shared/editor/rules/tables.ts b/shared/editor/rules/tables.ts index ffaaf4def2..350c38e6e7 100644 --- a/shared/editor/rules/tables.ts +++ b/shared/editor/rules/tables.ts @@ -1,4 +1,5 @@ import type MarkdownIt from "markdown-it"; +import type StateBlock from "markdown-it/lib/rules_block/state_block.mjs"; const BREAK_REGEX = /(?<=^|[^\\])\\n/; const BR_TAG_REGEX = //gi; @@ -6,7 +7,372 @@ const BR_TAG_REGEX = //gi; // Stops at
or newline to handle multiple checkboxes in a cell const CHECKBOX_REGEX = /^(?:-\s*)?\[(X|\s|_|-)\]\s([^<\n]*)?/i; +const TAB = 0x09; +const SPACE = 0x20; +const DOLLAR = 0x24; +const BACKSLASH = 0x5c; +const BACKTICK = 0x60; +const PIPE = 0x7c; +const HYPHEN = 0x2d; +const COLON = 0x3a; +const DIGIT_0 = 0x30; +const DIGIT_9 = 0x39; +const MAX_AUTOCOMPLETED_CELLS = 0x10000; + +function isSpace(code: number): boolean { + return code === SPACE || code === TAB; +} + +/** + * Skip over a backtick-delimited code span starting at `pos`. + * + * @returns the position just past the closing run if a balanced span exists, + * otherwise -1 (indicating the backticks at `pos` are literal text). + */ +function skipCodeSpan(str: string, pos: number): number { + const max = str.length; + const runStart = pos; + while (pos < max && str.charCodeAt(pos) === BACKTICK) { + pos++; + } + const tickCount = pos - runStart; + + let scan = pos; + while (scan < max) { + if (str.charCodeAt(scan) !== BACKTICK) { + scan++; + continue; + } + const closeStart = scan; + while (scan < max && str.charCodeAt(scan) === BACKTICK) { + scan++; + } + if (scan - closeStart === tickCount) { + return scan; + } + } + return -1; +} + +/** + * Skip over an inline $...$ math span starting at `pos`. + * + * Mirrors the opener/closer constraints used by the math inline rule so that + * literal dollar amounts (e.g. "$5") do not accidentally consume a cell. + * + * @returns the position just past the closing `$` if a balanced span exists, + * otherwise -1. + */ +function skipMathSpan(str: string, pos: number): number { + const max = str.length; + // Opener: next char must not be whitespace. + const next = pos + 1 < max ? str.charCodeAt(pos + 1) : -1; + if (next === -1 || isSpace(next)) { + return -1; + } + + let scan = pos + 1; + while (scan < max) { + const ch = str.charCodeAt(scan); + if (ch === BACKSLASH) { + // Skip escaped char. + scan += 2; + continue; + } + if (ch === DOLLAR) { + const prev = str.charCodeAt(scan - 1); + const after = scan + 1 < max ? str.charCodeAt(scan + 1) : -1; + // Closer: prev not whitespace, next not a digit. + if (!isSpace(prev) && !(after >= DIGIT_0 && after <= DIGIT_9)) { + return scan + 1; + } + } + scan++; + } + return -1; +} + +/** + * Split a table row on unescaped pipes, while ignoring pipes that fall inside + * inline code spans or inline math spans. This is a superset of markdown-it's + * default behaviour, which only honours `\|` escapes and so silently truncates + * cells whose content contains literal `|` characters inside `` `...` `` or + * `$...$`. + */ +function escapedSplit(str: string): string[] { + const result: string[] = []; + const max = str.length; + + let pos = 0; + let isEscaped = false; + let lastPos = 0; + let current = ""; + + while (pos < max) { + const ch = str.charCodeAt(pos); + + if (!isEscaped && ch === BACKTICK) { + const end = skipCodeSpan(str, pos); + if (end !== -1) { + pos = end; + isEscaped = false; + continue; + } + } else if (!isEscaped && ch === DOLLAR) { + const end = skipMathSpan(str, pos); + if (end !== -1) { + pos = end; + isEscaped = false; + continue; + } + } + + if (ch === PIPE) { + if (!isEscaped) { + result.push(current + str.substring(lastPos, pos)); + current = ""; + lastPos = pos + 1; + } else { + // Escaped pipe `\|` becomes part of the current cell, dropping the + // leading backslash. + current += str.substring(lastPos, pos - 1); + lastPos = pos; + } + } + + isEscaped = ch === BACKSLASH; + pos++; + } + + result.push(current + str.substring(lastPos)); + return result; +} + +function getLine(state: StateBlock, line: number): string { + const pos = state.bMarks[line] + state.tShift[line]; + const max = state.eMarks[line]; + return state.src.slice(pos, max); +} + +/** + * GFM table block rule, forked from markdown-it. The only behavioural change + * from the upstream rule is that {@link escapedSplit} also recognises + * backtick-delimited code spans and `$...$` math spans, so pipes inside such + * spans no longer split the row. + */ +function tableRule( + state: StateBlock, + startLine: number, + endLine: number, + silent: boolean +): boolean { + if (startLine + 2 > endLine) { + return false; + } + + let nextLine = startLine + 1; + if (state.sCount[nextLine] < state.blkIndent) { + return false; + } + if (state.sCount[nextLine] - state.blkIndent >= 4) { + return false; + } + + let pos = state.bMarks[nextLine] + state.tShift[nextLine]; + if (pos >= state.eMarks[nextLine]) { + return false; + } + + const firstCh = state.src.charCodeAt(pos++); + if (firstCh !== PIPE && firstCh !== HYPHEN && firstCh !== COLON) { + return false; + } + if (pos >= state.eMarks[nextLine]) { + return false; + } + + const secondCh = state.src.charCodeAt(pos++); + if ( + secondCh !== PIPE && + secondCh !== HYPHEN && + secondCh !== COLON && + !isSpace(secondCh) + ) { + return false; + } + if (firstCh === HYPHEN && isSpace(secondCh)) { + return false; + } + + while (pos < state.eMarks[nextLine]) { + const ch = state.src.charCodeAt(pos); + if (ch !== PIPE && ch !== HYPHEN && ch !== COLON && !isSpace(ch)) { + return false; + } + pos++; + } + + let lineText = getLine(state, startLine + 1); + let columns = lineText.split("|"); + const aligns: string[] = []; + for (let i = 0; i < columns.length; i++) { + const t = columns[i].trim(); + if (!t) { + if (i === 0 || i === columns.length - 1) { + continue; + } else { + return false; + } + } + if (!/^:?-+:?$/.test(t)) { + return false; + } + if (t.charCodeAt(t.length - 1) === COLON) { + aligns.push(t.charCodeAt(0) === COLON ? "center" : "right"); + } else if (t.charCodeAt(0) === COLON) { + aligns.push("left"); + } else { + aligns.push(""); + } + } + + lineText = getLine(state, startLine).trim(); + if (lineText.indexOf("|") === -1) { + return false; + } + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + columns = escapedSplit(lineText); + if (columns.length && columns[0] === "") { + columns.shift(); + } + if (columns.length && columns[columns.length - 1] === "") { + columns.pop(); + } + + const columnCount = columns.length; + if (columnCount === 0 || columnCount !== aligns.length) { + return false; + } + + if (silent) { + return true; + } + + const oldParentType = state.parentType; + // The StateBlock type only knows the standard parent-type strings. Casting + // keeps us in lockstep with the upstream rule. + state.parentType = "table" as typeof state.parentType; + + const terminatorRules = state.md.block.ruler.getRules("blockquote"); + + const tableTokenOpen = state.push("table_open", "table", 1); + const tableLines: [number, number] = [startLine, 0]; + tableTokenOpen.map = tableLines; + + const tHeadOpen = state.push("thead_open", "thead", 1); + tHeadOpen.map = [startLine, startLine + 1]; + + const headerRowOpen = state.push("tr_open", "tr", 1); + headerRowOpen.map = [startLine, startLine + 1]; + + for (let i = 0; i < columns.length; i++) { + const thOpen = state.push("th_open", "th", 1); + if (aligns[i]) { + thOpen.attrs = [["style", "text-align:" + aligns[i]]]; + } + const inlineToken = state.push("inline", "", 0); + inlineToken.content = columns[i].trim(); + inlineToken.children = []; + state.push("th_close", "th", -1); + } + + state.push("tr_close", "tr", -1); + state.push("thead_close", "thead", -1); + + let tBodyLines: [number, number] | undefined; + let autocompletedCells = 0; + + for (nextLine = startLine + 2; nextLine < endLine; nextLine++) { + if (state.sCount[nextLine] < state.blkIndent) { + break; + } + + let terminate = false; + for (let i = 0, l = terminatorRules.length; i < l; i++) { + if (terminatorRules[i](state, nextLine, endLine, true)) { + terminate = true; + break; + } + } + if (terminate) { + break; + } + + lineText = getLine(state, nextLine).trim(); + if (!lineText) { + break; + } + if (state.sCount[nextLine] - state.blkIndent >= 4) { + break; + } + + columns = escapedSplit(lineText); + if (columns.length && columns[0] === "") { + columns.shift(); + } + if (columns.length && columns[columns.length - 1] === "") { + columns.pop(); + } + + autocompletedCells += columnCount - columns.length; + if (autocompletedCells > MAX_AUTOCOMPLETED_CELLS) { + break; + } + + if (nextLine === startLine + 2) { + const tBodyOpen = state.push("tbody_open", "tbody", 1); + tBodyLines = [startLine + 2, 0]; + tBodyOpen.map = tBodyLines; + } + + const rowOpen = state.push("tr_open", "tr", 1); + rowOpen.map = [nextLine, nextLine + 1]; + + for (let i = 0; i < columnCount; i++) { + const tdOpen = state.push("td_open", "td", 1); + if (aligns[i]) { + tdOpen.attrs = [["style", "text-align:" + aligns[i]]]; + } + const inlineToken = state.push("inline", "", 0); + inlineToken.content = columns[i] ? columns[i].trim() : ""; + inlineToken.children = []; + state.push("td_close", "td", -1); + } + state.push("tr_close", "tr", -1); + } + + if (tBodyLines) { + state.push("tbody_close", "tbody", -1); + tBodyLines[1] = nextLine; + } + + state.push("table_close", "table", -1); + tableLines[1] = nextLine; + + state.parentType = oldParentType; + state.line = nextLine; + return true; +} + export default function markdownTables(md: MarkdownIt): void { + // Replace the built-in GFM table rule with one that ignores pipes inside + // inline code and math spans when splitting cells. Without this, content + // such as `` `|a-b|` `` would silently split a row into extra cells and + // truncate the document on round-trip. + md.block.ruler.at("table", tableRule, { alt: ["paragraph", "reference"] }); + // insert a new rule after the "inline" rules are parsed md.core.ruler.after("inline", "tables-pm", (state) => { const tokens = state.tokens;