fix: Pipes in math and code blocks within table cells

This commit is contained in:
Tom Moor
2026-05-28 21:42:08 -04:00
parent ae5cd6a159
commit d581de55f7
4 changed files with 614 additions and 8 deletions
+110 -1
View File
@@ -2,7 +2,7 @@ import crypto from "node:crypto";
import { Node } from "prosemirror-model";
import { ProsemirrorHelper as ServerProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper";
import { ProsemirrorHelper } from "@shared/utils/ProsemirrorHelper";
import { schema } from "@server/editor";
import { parser, schema, serializer } from "@server/editor";
// Note: The test is here rather than shared to access the schema
describe("#ProsemirrorHelper", () => {
@@ -184,6 +184,115 @@ describe("#ProsemirrorHelper", () => {
});
});
describe("table markdown round trip", () => {
const roundTrip = (md: string) => {
const doc = parser.parse(md);
expect(doc).not.toBeNull();
const first = serializer.serialize(doc!);
const second = serializer.serialize(parser.parse(first)!);
return { first, second };
};
const getCellTexts = (md: string) => {
const doc = parser.parse(md)!;
const table = doc.content.firstChild!;
expect(table.type.name).toBe("table");
const rows: string[][] = [];
table.forEach((row) => {
const cells: string[] = [];
row.forEach((cell) => cells.push(cell.textContent));
rows.push(cells);
});
return rows;
};
it("preserves a single inline code span containing pipes", () => {
const cells = getCellTexts(
["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n")
);
expect(cells).toEqual([
["A", "B"],
["x", "|y|"],
]);
});
it("preserves multiple inline code spans with pipes in the same cell", () => {
const md = [
"| Condition | Facts |",
"| --- | --- |",
"| Absolute time difference | The system checks `|Clock_NTP_Camera1 - Clock_GPS_Camera1|` and `|Clock_NTP_Camera2 - Clock_GPS_Camera2|`. |",
"",
].join("\n");
const cells = getCellTexts(md);
expect(cells).toHaveLength(2);
expect(cells[1][0]).toBe("Absolute time difference");
expect(cells[1][1]).toBe(
"The system checks |Clock_NTP_Camera1 - Clock_GPS_Camera1| and |Clock_NTP_Camera2 - Clock_GPS_Camera2|."
);
});
it("preserves inline math containing pipes", () => {
const cells = getCellTexts(
["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join("\n")
);
expect(cells[1][0]).toBe("x");
expect(cells[1][1]).toBe("|a-b|");
});
it("preserves identifiers with underscores and braces inside code spans", () => {
const cells = getCellTexts(
[
"| Field | Value |",
"| --- | --- |",
"| ID | `foo_{bar}|baz_{qux}` |",
"",
].join("\n")
);
expect(cells[1][1]).toBe("foo_{bar}|baz_{qux}");
});
it("re-serializes a table with code-span pipes idempotently", () => {
const { first, second } = roundTrip(
["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n")
);
expect(second).toBe(first);
});
it("re-serializes a table with prose plus code-span pipes idempotently", () => {
const { first, second } = roundTrip(
[
"| Condition | Facts |",
"| --- | --- |",
"| Absolute time difference | The system checks `|Clock_NTP - Clock_GPS|`. |",
"",
].join("\n")
);
expect(second).toBe(first);
});
it("re-serializes a table with inline math pipes idempotently", () => {
const { first, second } = roundTrip(
["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join("\n")
);
expect(second).toBe(first);
});
it("still splits cells on unescaped pipes outside code spans", () => {
const cells = getCellTexts(
["| A | B | C |", "| --- | --- | --- |", "| x | y | z |", ""].join("\n")
);
expect(cells[1]).toEqual(["x", "y", "z"]);
});
});
describe("#removeMarks", () => {
it("preserves table cell background color when removing comment marks", () => {
const doc = Node.fromJSON(schema, {
+4 -7
View File
@@ -345,16 +345,13 @@ export class MarkdownSerializerState {
this.text(this.markString(add, true, parent, index), false);
}
// Render the node. Special case code marks, since their content is not
// escaped, apart from pipes in tables.
// Render the node. Special case code marks, since their content is
// not escaped. Pipes inside code spans are safe inside tables because
// the block table rule recognises code spans when splitting cells.
if (noEsc && node.isText) {
const text = this.inTable
? node.text.replace(/\|/gi, "\\$&")
: node.text;
this.text(
this.markString(inner, true, parent, index) +
text +
node.text +
this.markString(inner, false, parent, index + 1),
false
);
+134
View File
@@ -0,0 +1,134 @@
import markdownit from "markdown-it";
import tablesRule from "./tables";
type Cell = { content: string };
function parseRows(md: string): Cell[][] {
const parser = markdownit("default").use(tablesRule);
const tokens = parser.parse(md, {});
const rows: Cell[][] = [];
let row: Cell[] | null = null;
for (const token of tokens) {
if (token.type === "tr_open") {
row = [];
} else if (token.type === "tr_close" && row) {
rows.push(row);
row = null;
} else if ((token.type === "th_open" || token.type === "td_open") && row) {
// The next inline token holds the cell text.
row.push({ content: "" });
} else if (token.type === "inline" && row && row.length > 0) {
row[row.length - 1].content = token.content;
}
}
return rows;
}
describe("table rule", () => {
describe("pipes inside code spans", () => {
it("does not split a cell on pipes inside a backtick code span", () => {
const md = ["| A | B |", "| --- | --- |", "| x | `|y|` |", ""].join("\n");
const rows = parseRows(md);
expect(rows).toHaveLength(2);
expect(rows[0].map((c) => c.content)).toEqual(["A", "B"]);
expect(rows[1].map((c) => c.content)).toEqual(["x", "`|y|`"]);
});
it("preserves multiple code-spanned expressions with pipes in one cell", () => {
const md = [
"| Condition | Facts |",
"| --- | --- |",
"| Absolute time difference | The system checks `|Clock_NTP - Clock_GPS|` and `|Clock_NTP2 - Clock_GPS2|`. |",
"",
].join("\n");
const rows = parseRows(md);
expect(rows).toHaveLength(2);
expect(rows[1]).toHaveLength(2);
expect(rows[1][0].content).toBe("Absolute time difference");
expect(rows[1][1].content).toBe(
"The system checks `|Clock_NTP - Clock_GPS|` and `|Clock_NTP2 - Clock_GPS2|`."
);
});
it("handles double-backtick code spans containing single backticks and pipes", () => {
const md = ["| A | B |", "| --- | --- |", "| x | ``a|b`c`` |", ""].join(
"\n"
);
const rows = parseRows(md);
expect(rows[1].map((c) => c.content)).toEqual(["x", "``a|b`c``"]);
});
it("still treats unfenced pipes as cell separators", () => {
const md = [
"| A | B | C |",
"| --- | --- | --- |",
"| x | y | z |",
"",
].join("\n");
const rows = parseRows(md);
expect(rows[1].map((c) => c.content)).toEqual(["x", "y", "z"]);
});
it("treats backslash-escaped pipes inside code spans as literal", () => {
const md = ["| A | B |", "| --- | --- |", "| x | `\\|y\\|` |", ""].join(
"\n"
);
const rows = parseRows(md);
// Backslash escapes are not processed inside code spans, but the
// pipes are inside the code span so they must not split the cell.
expect(rows[1]).toHaveLength(2);
expect(rows[1][0].content).toBe("x");
// The cell should still contain the original code-span text (the
// exact serialized form is not important here, but it must not be
// truncated).
expect(rows[1][1].content).toContain("y");
expect(rows[1][1].content).toContain("|");
});
});
describe("pipes inside inline math", () => {
it("does not split a cell on pipes inside $...$ math", () => {
const md = ["| A | B |", "| --- | --- |", "| x | $|a-b|$ |", ""].join(
"\n"
);
const rows = parseRows(md);
expect(rows[1]).toHaveLength(2);
expect(rows[1][1].content).toBe("$|a-b|$");
});
it("does not treat lone dollar signs as math openers", () => {
const md = ["| A | B |", "| --- | --- |", "| $5 | $10 |", ""].join("\n");
const rows = parseRows(md);
// Lone dollar amounts shouldn't open math spans and merge cells.
expect(rows[1].map((c) => c.content)).toEqual(["$5", "$10"]);
});
});
describe("backslash escapes outside code spans", () => {
it("still honors \\| as an escaped cell-pipe", () => {
const md = ["| A | B |", "| --- | --- |", "| x | a\\|b |", ""].join("\n");
const rows = parseRows(md);
expect(rows[1]).toHaveLength(2);
expect(rows[1][1].content).toContain("|");
});
});
});
+366
View File
@@ -1,4 +1,5 @@
import type MarkdownIt from "markdown-it";
import type StateBlock from "markdown-it/lib/rules_block/state_block.mjs";
const BREAK_REGEX = /(?<=^|[^\\])\\n/;
const BR_TAG_REGEX = /<br\s*\/?>/gi;
@@ -6,7 +7,372 @@ const BR_TAG_REGEX = /<br\s*\/?>/gi;
// Stops at <br> or newline to handle multiple checkboxes in a cell
const CHECKBOX_REGEX = /^(?:-\s*)?\[(X|\s|_|-)\]\s([^<\n]*)?/i;
const TAB = 0x09;
const SPACE = 0x20;
const DOLLAR = 0x24;
const BACKSLASH = 0x5c;
const BACKTICK = 0x60;
const PIPE = 0x7c;
const HYPHEN = 0x2d;
const COLON = 0x3a;
const DIGIT_0 = 0x30;
const DIGIT_9 = 0x39;
const MAX_AUTOCOMPLETED_CELLS = 0x10000;
function isSpace(code: number): boolean {
return code === SPACE || code === TAB;
}
/**
* Skip over a backtick-delimited code span starting at `pos`.
*
* @returns the position just past the closing run if a balanced span exists,
* otherwise -1 (indicating the backticks at `pos` are literal text).
*/
function skipCodeSpan(str: string, pos: number): number {
const max = str.length;
const runStart = pos;
while (pos < max && str.charCodeAt(pos) === BACKTICK) {
pos++;
}
const tickCount = pos - runStart;
let scan = pos;
while (scan < max) {
if (str.charCodeAt(scan) !== BACKTICK) {
scan++;
continue;
}
const closeStart = scan;
while (scan < max && str.charCodeAt(scan) === BACKTICK) {
scan++;
}
if (scan - closeStart === tickCount) {
return scan;
}
}
return -1;
}
/**
* Skip over an inline $...$ math span starting at `pos`.
*
* Mirrors the opener/closer constraints used by the math inline rule so that
* literal dollar amounts (e.g. "$5") do not accidentally consume a cell.
*
* @returns the position just past the closing `$` if a balanced span exists,
* otherwise -1.
*/
function skipMathSpan(str: string, pos: number): number {
const max = str.length;
// Opener: next char must not be whitespace.
const next = pos + 1 < max ? str.charCodeAt(pos + 1) : -1;
if (next === -1 || isSpace(next)) {
return -1;
}
let scan = pos + 1;
while (scan < max) {
const ch = str.charCodeAt(scan);
if (ch === BACKSLASH) {
// Skip escaped char.
scan += 2;
continue;
}
if (ch === DOLLAR) {
const prev = str.charCodeAt(scan - 1);
const after = scan + 1 < max ? str.charCodeAt(scan + 1) : -1;
// Closer: prev not whitespace, next not a digit.
if (!isSpace(prev) && !(after >= DIGIT_0 && after <= DIGIT_9)) {
return scan + 1;
}
}
scan++;
}
return -1;
}
/**
* Split a table row on unescaped pipes, while ignoring pipes that fall inside
* inline code spans or inline math spans. This is a superset of markdown-it's
* default behaviour, which only honours `\|` escapes and so silently truncates
* cells whose content contains literal `|` characters inside `` `...` `` or
* `$...$`.
*/
function escapedSplit(str: string): string[] {
const result: string[] = [];
const max = str.length;
let pos = 0;
let isEscaped = false;
let lastPos = 0;
let current = "";
while (pos < max) {
const ch = str.charCodeAt(pos);
if (!isEscaped && ch === BACKTICK) {
const end = skipCodeSpan(str, pos);
if (end !== -1) {
pos = end;
isEscaped = false;
continue;
}
} else if (!isEscaped && ch === DOLLAR) {
const end = skipMathSpan(str, pos);
if (end !== -1) {
pos = end;
isEscaped = false;
continue;
}
}
if (ch === PIPE) {
if (!isEscaped) {
result.push(current + str.substring(lastPos, pos));
current = "";
lastPos = pos + 1;
} else {
// Escaped pipe `\|` becomes part of the current cell, dropping the
// leading backslash.
current += str.substring(lastPos, pos - 1);
lastPos = pos;
}
}
isEscaped = ch === BACKSLASH;
pos++;
}
result.push(current + str.substring(lastPos));
return result;
}
function getLine(state: StateBlock, line: number): string {
const pos = state.bMarks[line] + state.tShift[line];
const max = state.eMarks[line];
return state.src.slice(pos, max);
}
/**
* GFM table block rule, forked from markdown-it. The only behavioural change
* from the upstream rule is that {@link escapedSplit} also recognises
* backtick-delimited code spans and `$...$` math spans, so pipes inside such
* spans no longer split the row.
*/
function tableRule(
state: StateBlock,
startLine: number,
endLine: number,
silent: boolean
): boolean {
if (startLine + 2 > endLine) {
return false;
}
let nextLine = startLine + 1;
if (state.sCount[nextLine] < state.blkIndent) {
return false;
}
if (state.sCount[nextLine] - state.blkIndent >= 4) {
return false;
}
let pos = state.bMarks[nextLine] + state.tShift[nextLine];
if (pos >= state.eMarks[nextLine]) {
return false;
}
const firstCh = state.src.charCodeAt(pos++);
if (firstCh !== PIPE && firstCh !== HYPHEN && firstCh !== COLON) {
return false;
}
if (pos >= state.eMarks[nextLine]) {
return false;
}
const secondCh = state.src.charCodeAt(pos++);
if (
secondCh !== PIPE &&
secondCh !== HYPHEN &&
secondCh !== COLON &&
!isSpace(secondCh)
) {
return false;
}
if (firstCh === HYPHEN && isSpace(secondCh)) {
return false;
}
while (pos < state.eMarks[nextLine]) {
const ch = state.src.charCodeAt(pos);
if (ch !== PIPE && ch !== HYPHEN && ch !== COLON && !isSpace(ch)) {
return false;
}
pos++;
}
let lineText = getLine(state, startLine + 1);
let columns = lineText.split("|");
const aligns: string[] = [];
for (let i = 0; i < columns.length; i++) {
const t = columns[i].trim();
if (!t) {
if (i === 0 || i === columns.length - 1) {
continue;
} else {
return false;
}
}
if (!/^:?-+:?$/.test(t)) {
return false;
}
if (t.charCodeAt(t.length - 1) === COLON) {
aligns.push(t.charCodeAt(0) === COLON ? "center" : "right");
} else if (t.charCodeAt(0) === COLON) {
aligns.push("left");
} else {
aligns.push("");
}
}
lineText = getLine(state, startLine).trim();
if (lineText.indexOf("|") === -1) {
return false;
}
if (state.sCount[startLine] - state.blkIndent >= 4) {
return false;
}
columns = escapedSplit(lineText);
if (columns.length && columns[0] === "") {
columns.shift();
}
if (columns.length && columns[columns.length - 1] === "") {
columns.pop();
}
const columnCount = columns.length;
if (columnCount === 0 || columnCount !== aligns.length) {
return false;
}
if (silent) {
return true;
}
const oldParentType = state.parentType;
// The StateBlock type only knows the standard parent-type strings. Casting
// keeps us in lockstep with the upstream rule.
state.parentType = "table" as typeof state.parentType;
const terminatorRules = state.md.block.ruler.getRules("blockquote");
const tableTokenOpen = state.push("table_open", "table", 1);
const tableLines: [number, number] = [startLine, 0];
tableTokenOpen.map = tableLines;
const tHeadOpen = state.push("thead_open", "thead", 1);
tHeadOpen.map = [startLine, startLine + 1];
const headerRowOpen = state.push("tr_open", "tr", 1);
headerRowOpen.map = [startLine, startLine + 1];
for (let i = 0; i < columns.length; i++) {
const thOpen = state.push("th_open", "th", 1);
if (aligns[i]) {
thOpen.attrs = [["style", "text-align:" + aligns[i]]];
}
const inlineToken = state.push("inline", "", 0);
inlineToken.content = columns[i].trim();
inlineToken.children = [];
state.push("th_close", "th", -1);
}
state.push("tr_close", "tr", -1);
state.push("thead_close", "thead", -1);
let tBodyLines: [number, number] | undefined;
let autocompletedCells = 0;
for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
if (state.sCount[nextLine] < state.blkIndent) {
break;
}
let terminate = false;
for (let i = 0, l = terminatorRules.length; i < l; i++) {
if (terminatorRules[i](state, nextLine, endLine, true)) {
terminate = true;
break;
}
}
if (terminate) {
break;
}
lineText = getLine(state, nextLine).trim();
if (!lineText) {
break;
}
if (state.sCount[nextLine] - state.blkIndent >= 4) {
break;
}
columns = escapedSplit(lineText);
if (columns.length && columns[0] === "") {
columns.shift();
}
if (columns.length && columns[columns.length - 1] === "") {
columns.pop();
}
autocompletedCells += columnCount - columns.length;
if (autocompletedCells > MAX_AUTOCOMPLETED_CELLS) {
break;
}
if (nextLine === startLine + 2) {
const tBodyOpen = state.push("tbody_open", "tbody", 1);
tBodyLines = [startLine + 2, 0];
tBodyOpen.map = tBodyLines;
}
const rowOpen = state.push("tr_open", "tr", 1);
rowOpen.map = [nextLine, nextLine + 1];
for (let i = 0; i < columnCount; i++) {
const tdOpen = state.push("td_open", "td", 1);
if (aligns[i]) {
tdOpen.attrs = [["style", "text-align:" + aligns[i]]];
}
const inlineToken = state.push("inline", "", 0);
inlineToken.content = columns[i] ? columns[i].trim() : "";
inlineToken.children = [];
state.push("td_close", "td", -1);
}
state.push("tr_close", "tr", -1);
}
if (tBodyLines) {
state.push("tbody_close", "tbody", -1);
tBodyLines[1] = nextLine;
}
state.push("table_close", "table", -1);
tableLines[1] = nextLine;
state.parentType = oldParentType;
state.line = nextLine;
return true;
}
export default function markdownTables(md: MarkdownIt): void {
// Replace the built-in GFM table rule with one that ignores pipes inside
// inline code and math spans when splitting cells. Without this, content
// such as `` `|a-b|` `` would silently split a row into extra cells and
// truncate the document on round-trip.
md.block.ruler.at("table", tableRule, { alt: ["paragraph", "reference"] });
// insert a new rule after the "inline" rules are parsed
md.core.ruler.after("inline", "tables-pm", (state) => {
const tokens = state.tokens;