Words separated by hyphens to be treated as a single unit for word diffing (#11272)

* fix: hyphenated word diff

* Add tests, simplify, reduce gap allowance

* tsc

* simplify

---------

Co-authored-by: Tom Moor <tom@getoutline.com>
This commit is contained in:
Apoorv Mishra
2026-05-31 03:41:14 +05:30
committed by GitHub
parent 224230eaa0
commit b2bb2335a1
3 changed files with 1204 additions and 1 deletions
@@ -13,6 +13,38 @@ export interface Example {
* Node and mark names are matched against those defined in the shared editor schema.
*/
export const examples: Example[] = [
{
id: "word-replacement",
name: "Word replacement (interleaved)",
before: {
type: "doc",
content: [
{
type: "paragraph",
content: [
{
type: "text",
text: "no-await-in-loop",
},
],
},
],
},
after: {
type: "doc",
content: [
{
type: "paragraph",
content: [
{
type: "text",
text: "jsx-no-jsx-as-prop",
},
],
},
],
},
},
{
id: "simple-text",
name: "Simple text change",
@@ -632,6 +664,846 @@ export const examples: Example[] = [
],
},
},
{
id: "tc",
name: "More table changes",
before: {
type: "doc",
content: [
{
type: "heading",
attrs: {
level: 1,
},
content: [
{
text: "Perf:",
type: "text",
},
],
},
{
type: "paragraph",
content: [
{
text: "Code that can be written to run faster.",
type: "text",
},
],
},
{
type: "table",
content: [
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Rule name",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Source",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Default",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Fixable?",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "no-await-in-loop",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "eslint",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "prefer-set-has",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "unicorn",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "\ud83d\udee0\ufe0f",
type: "text",
},
],
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "no-map-spread",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "oxc",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "\ud83d\udee0\ufe0f\ud83d\udca1",
type: "text",
},
],
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "no-array-index-key",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "react",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
],
},
],
},
{
type: "paragraph",
},
],
},
after: {
type: "doc",
content: [
{
type: "heading",
attrs: {
level: 1,
},
content: [
{
text: "Perf:",
type: "text",
},
],
},
{
type: "paragraph",
content: [
{
text: "Code that can be written to run faster.",
type: "text",
},
],
},
{
type: "table",
content: [
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Rule name",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Source",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Default",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "Fixable?",
type: "text",
marks: [
{
type: "strong",
attrs: {},
},
],
},
],
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "jsx-no-jsx-as-prop",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "react-perf",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "no-map-spread",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "oxc",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "\ud83d\udee0\ufe0f",
type: "text",
},
{
type: "emoji",
attrs: {
"data-name": "bulb",
},
},
],
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "prefer-array-find",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "unicorn",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
type: "emoji",
attrs: {
"data-name": "construction",
},
},
],
},
],
},
],
},
{
type: "tr",
content: [
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "prefer-set-has",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
text: "unicorn",
type: "text",
},
],
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
},
],
},
{
type: "td",
attrs: {
colspan: 1,
rowspan: 1,
},
content: [
{
type: "paragraph",
content: [
{
type: "emoji",
attrs: {
"data-name": "warning",
},
},
{
type: "emoji",
attrs: {
"data-name": "hammer_and_wrench",
},
},
],
},
],
},
],
},
],
},
{
type: "paragraph",
},
],
},
},
{
id: "table-add-row",
name: "Table: add row",
+178
View File
@@ -0,0 +1,178 @@
import type { Slice } from "prosemirror-model";
import { ChangesetHelper, type ExtendedChange } from "./ChangesetHelper";
import type { ProsemirrorData } from "../../types";
/**
* Builds a single-paragraph document from the given text.
*/
function para(text: string): ProsemirrorData {
return {
type: "doc",
content: [{ type: "paragraph", content: [{ type: "text", text }] }],
};
}
/**
* Builds a document with one paragraph per provided text.
*/
function paras(...texts: string[]): ProsemirrorData {
return {
type: "doc",
content: texts.map((text) => ({
type: "paragraph",
content: [{ type: "text", text }],
})),
};
}
/**
* Computes the changeset between two documents, asserts it is non-null, and
* returns the resulting changes.
*/
function changesFor(
after: ProsemirrorData,
before: ProsemirrorData
): readonly ExtendedChange[] {
const result = ChangesetHelper.getChangeset(after, before);
expect(result).not.toBeNull();
return result!.changes;
}
/**
* Concatenates the text content of every deletion slice in a change.
*/
function deletedText(change: ExtendedChange): string {
return change.deleted
.map((deletion) => {
const { slice } = deletion.data as { slice: Slice | null };
return slice ? slice.content.textBetween(0, slice.content.size) : "";
})
.join("");
}
describe("ChangesetHelper.getChangeset", () => {
it("returns null when there is no previous revision to compare against", () => {
expect(ChangesetHelper.getChangeset(para("Hello"), null)).toBeNull();
expect(ChangesetHelper.getChangeset(null, para("Hello"))).toBeNull();
});
describe("interleaved word-diff merging", () => {
it("merges a hyphenated word replacement into a single change", () => {
// Word-level diffing splits "no-await-in-loop" -> "jsx-no-jsx-as-prop"
// into several interleaved delete/insert changes around the hyphens.
// These should be merged back into one change covering the whole word.
const changes = changesFor(
para("jsx-no-jsx-as-prop"),
para("no-await-in-loop")
);
expect(changes).toHaveLength(1);
const [change] = changes;
expect(change.fromA).toBe(1);
expect(change.toA).toBe(17); // length of "no-await-in-loop" + 1 (doc offset)
expect(change.fromB).toBe(1);
expect(change.toB).toBe(19); // length of "jsx-no-jsx-as-prop" + 1
expect(change.deleted).toHaveLength(1);
expect(change.inserted).toHaveLength(1);
});
it("captures the full original word in the merged deletion", () => {
const changes = changesFor(
para("jsx-no-jsx-as-prop"),
para("no-await-in-loop")
);
expect(deletedText(changes[0])).toBe("no-await-in-loop");
});
});
describe("does not over-merge unrelated changes", () => {
it("keeps edits in separate nodes separate", () => {
// Two single-character replacements in two different paragraphs sit
// close together positionally, but the gap between them crosses a
// paragraph boundary and must not be merged.
const changes = changesFor(paras("c", "d"), paras("a", "b"));
expect(changes).toHaveLength(2);
expect(deletedText(changes[0])).toBe("a");
expect(deletedText(changes[1])).toBe("b");
});
it("keeps edits separated by a large unchanged gap separate", () => {
// "quick" and "fox" are replaced, but the unchanged " brown " between
// them exceeds the merge gap threshold, so they stay distinct.
const changes = changesFor(
para("The slow brown lazy"),
para("The quick brown fox")
);
expect(changes).toHaveLength(2);
expect(deletedText(changes[0])).toBe("quick");
expect(deletedText(changes[1])).toBe("fox");
});
it("does not merge a cluster of pure insertions", () => {
// Inserting text on both sides of the unchanged "a" produces two pure
// insertions a short gap apart. Merging them would render the unchanged
// "a" as inserted, so the window (no deletion) must not merge.
const changes = changesFor(para("foo a bar"), para("a"));
expect(changes).toHaveLength(2);
// Both are pure insertions — nothing is marked as deleted.
expect(changes.every((change) => change.deleted.length === 0)).toBe(true);
});
it("does not merge a cluster of pure deletions", () => {
// Deleting text on both sides of the unchanged "a" produces two pure
// deletions a short gap apart. Merging them would render the unchanged
// "a" as deleted, so the window (no insertion) must not merge.
const changes = changesFor(para("a"), para("foo a bar"));
expect(changes).toHaveLength(2);
// The unchanged "a" is not absorbed into either deletion.
expect(deletedText(changes[0])).toBe("foo ");
expect(deletedText(changes[1])).toBe(" bar");
});
});
describe("gap merge threshold", () => {
// Two word replacements ("cat"->"fox", "dog"->"pig") separated by an
// unchanged middle word. The gap between the two changes equals the
// middle length plus its two surrounding spaces. mergeInterleavedChanges
// merges them only while that gap is <= MAX_UNCHANGED_GAP (3).
//
// The trade-off these cases document: when the gap is small enough to
// merge, the unchanged middle word is absorbed into the deletion and
// therefore rendered as deleted + reinserted.
it("merges across a gap of 3, absorbing the unchanged middle", () => {
// " a " => gap of 3 (1 char + 2 spaces), the threshold.
const changes = changesFor(para("fox a pig"), para("cat a dog"));
expect(changes).toHaveLength(1);
// The unchanged "a" is swallowed into the merged deletion.
expect(deletedText(changes[0])).toBe("cat a dog");
});
it("does not merge across a gap of 4", () => {
// " ab " => gap of 4 (2 chars + 2 spaces), just past the threshold.
const changes = changesFor(para("fox ab pig"), para("cat ab dog"));
expect(changes).toHaveLength(2);
// The two edits stay distinct and the middle "ab" is untouched.
expect(deletedText(changes[0])).toBe("cat");
expect(deletedText(changes[1])).toBe("dog");
});
});
it("does not affect a simple single-word change", () => {
const changes = changesFor(
para("Hello modified world"),
para("Hello world")
);
expect(changes).toHaveLength(1);
expect(changes[0].inserted).toHaveLength(1);
});
});
+154 -1
View File
@@ -8,6 +8,155 @@ import { recreateTransform } from "./prosemirror-recreate-transform";
import { richExtensions, withComments } from "../nodes";
import type { ProsemirrorData } from "../../types";
/**
* The structural subset of a changeset `Change` that this module reads and
* produces. Using a `Pick` rather than the `Change` class avoids requiring
* class-only members (such as `toJSON`) on the plain objects we build.
*/
type ChangeFields<T> = Pick<
Change<T>,
"fromA" | "toA" | "fromB" | "toB" | "deleted" | "inserted"
>;
/**
* The maximum number of unchanged characters allowed between two adjacent
* changes for them to still be merged into a single change. This is
* deliberately small: merging absorbs the gap into the diff, so any unchanged
* text within it is rendered as deleted/reinserted. A value of 3 is the
* minimum that rejoins hyphenated word fragments such as replacing
* "no-await-in-loop", where the differ aligns the shared "no" and leaves an
* unchanged "no-" (gap of 3) between fragments, without swallowing longer
* unchanged words.
*/
const MAX_UNCHANGED_GAP = 3;
/**
* Merges adjacent Change objects that represent interleaved deletions/insertions.
*
* When word-level diffing is used, replacing "no-await-in-loop" with
* "jsx-no-jsx-as-prop" can produce multiple adjacent Change objects like:
* Change1: delete "no", insert "jsx"
* Change2: delete "await", insert "no"
* ...
*
* This function merges such adjacent changes into a single change:
* Change: delete "no-await-in-loop", insert "jsx-no-jsx-as-prop"
*
* @param changes - The changes from simplifyChanges
* @param docOld - The old document (to extract merged deletion content)
* @returns Changes with adjacent interleaved changes merged
*/
function mergeInterleavedChanges<T extends { step: Step; slice: Slice | null }>(
changes: readonly ChangeFields<T>[],
docOld: Node
): ChangeFields<T>[] {
if (changes.length <= 1) {
return [...changes];
}
const result: ChangeFields<T>[] = [];
let i = 0;
while (i < changes.length) {
const current = changes[i];
// Check if this change and subsequent changes form a contiguous replacement
// (i.e., they're adjacent in both old and new document positions)
let j = i + 1;
while (j < changes.length) {
const prev = changes[j - 1];
const next = changes[j];
// Check if changes are adjacent (toA of prev equals fromA of next, same for B)
// Allow gaps (like hyphens or other unchanged characters between changes)
const gapA = next.fromA - prev.toA;
const gapB = next.fromB - prev.toB;
// Check if changes are in the same parent node (e.g., same table cell, same paragraph)
// by verifying that the gap in the old document doesn't cross node boundaries
let crossesNodeBoundary = false;
if (gapA > 0) {
try {
// If the gap contains any non-text node, it crosses a boundary
const gap = docOld.slice(prev.toA, next.fromA).content;
for (let n = 0; n < gap.childCount; n++) {
if (!gap.child(n).isText) {
crossesNodeBoundary = true;
break;
}
}
} catch {
crossesNodeBoundary = true;
}
}
// If gaps are equal, reasonably small, and don't cross node boundaries,
// they're part of the same logical replacement
if (
gapA === gapB &&
gapA >= 0 &&
gapA <= MAX_UNCHANGED_GAP &&
!crossesNodeBoundary
) {
j++;
} else {
break;
}
}
// The merged change only needs the first deletion/insertion in the window
// to carry forward the originating step; the spans themselves are
// recomputed from the window bounds below.
let firstDeleted: { length: number; data: T } | undefined;
let firstInserted: { length: number; data: T } | undefined;
for (let k = i; k < j; k++) {
firstDeleted ??= changes[k].deleted[0];
firstInserted ??= changes[k].inserted[0];
}
// Merge the window only when it is a genuine replacement — it must contain
// both a deletion and an insertion. Otherwise a cluster of pure insertions
// (or pure deletions) separated by a short unchanged gap would merge and
// render the unchanged text between them as inserted/deleted.
if (j > i + 1 && firstDeleted && firstInserted) {
const lastChange = changes[j - 1];
// Create merged change. The deletion slice holds the original (old) text
// spanning the whole window so it renders as one block; it is not treated
// as a modification downstream because its text differs from the insertion.
const mergedChange: ChangeFields<T> = {
fromA: current.fromA,
toA: lastChange.toA,
fromB: current.fromB,
toB: lastChange.toB,
deleted: [
{
length: lastChange.toA - current.fromA,
data: {
...firstDeleted.data,
slice: docOld.slice(current.fromA, lastChange.toA),
} as T,
},
],
inserted: [
{
length: lastChange.toB - current.fromB,
data: firstInserted.data,
},
],
};
result.push(mergedChange);
i = j;
} else {
result.push(current);
i++;
}
}
return result;
}
/**
* Represents a modification (attribute change) in the document.
*/
@@ -147,7 +296,11 @@ export class ChangesetHelper {
}))
);
let changes = simplifyChanges(changeset.changes, docNew);
// Merge interleaved deletions/insertions into cleaner blocks
const changes = mergeInterleavedChanges(
simplifyChanges(changeset.changes, docNew),
docOld
);
// Post-process changes to detect modifications (attribute-only changes)
const extendedChanges: ExtendedChange[] = changes.map((change) => {