mirror of
https://github.com/outline/outline.git
synced 2026-06-13 03:14:59 +03:00
Words separated by hyphens to be treated as a single unit for word diffing (#11272)
* fix: hyphenated word diff * Add tests, simplify, reduce gap allowance * tsc * simplify --------- Co-authored-by: Tom Moor <tom@getoutline.com>
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
import type { Slice } from "prosemirror-model";
|
||||
import { ChangesetHelper, type ExtendedChange } from "./ChangesetHelper";
|
||||
import type { ProsemirrorData } from "../../types";
|
||||
|
||||
/**
|
||||
* Builds a single-paragraph document from the given text.
|
||||
*/
|
||||
function para(text: string): ProsemirrorData {
|
||||
return {
|
||||
type: "doc",
|
||||
content: [{ type: "paragraph", content: [{ type: "text", text }] }],
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a document with one paragraph per provided text.
|
||||
*/
|
||||
function paras(...texts: string[]): ProsemirrorData {
|
||||
return {
|
||||
type: "doc",
|
||||
content: texts.map((text) => ({
|
||||
type: "paragraph",
|
||||
content: [{ type: "text", text }],
|
||||
})),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the changeset between two documents, asserts it is non-null, and
|
||||
* returns the resulting changes.
|
||||
*/
|
||||
function changesFor(
|
||||
after: ProsemirrorData,
|
||||
before: ProsemirrorData
|
||||
): readonly ExtendedChange[] {
|
||||
const result = ChangesetHelper.getChangeset(after, before);
|
||||
expect(result).not.toBeNull();
|
||||
return result!.changes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Concatenates the text content of every deletion slice in a change.
|
||||
*/
|
||||
function deletedText(change: ExtendedChange): string {
|
||||
return change.deleted
|
||||
.map((deletion) => {
|
||||
const { slice } = deletion.data as { slice: Slice | null };
|
||||
return slice ? slice.content.textBetween(0, slice.content.size) : "";
|
||||
})
|
||||
.join("");
|
||||
}
|
||||
|
||||
describe("ChangesetHelper.getChangeset", () => {
|
||||
it("returns null when there is no previous revision to compare against", () => {
|
||||
expect(ChangesetHelper.getChangeset(para("Hello"), null)).toBeNull();
|
||||
expect(ChangesetHelper.getChangeset(null, para("Hello"))).toBeNull();
|
||||
});
|
||||
|
||||
describe("interleaved word-diff merging", () => {
|
||||
it("merges a hyphenated word replacement into a single change", () => {
|
||||
// Word-level diffing splits "no-await-in-loop" -> "jsx-no-jsx-as-prop"
|
||||
// into several interleaved delete/insert changes around the hyphens.
|
||||
// These should be merged back into one change covering the whole word.
|
||||
const changes = changesFor(
|
||||
para("jsx-no-jsx-as-prop"),
|
||||
para("no-await-in-loop")
|
||||
);
|
||||
|
||||
expect(changes).toHaveLength(1);
|
||||
|
||||
const [change] = changes;
|
||||
expect(change.fromA).toBe(1);
|
||||
expect(change.toA).toBe(17); // length of "no-await-in-loop" + 1 (doc offset)
|
||||
expect(change.fromB).toBe(1);
|
||||
expect(change.toB).toBe(19); // length of "jsx-no-jsx-as-prop" + 1
|
||||
expect(change.deleted).toHaveLength(1);
|
||||
expect(change.inserted).toHaveLength(1);
|
||||
});
|
||||
|
||||
it("captures the full original word in the merged deletion", () => {
|
||||
const changes = changesFor(
|
||||
para("jsx-no-jsx-as-prop"),
|
||||
para("no-await-in-loop")
|
||||
);
|
||||
|
||||
expect(deletedText(changes[0])).toBe("no-await-in-loop");
|
||||
});
|
||||
});
|
||||
|
||||
describe("does not over-merge unrelated changes", () => {
|
||||
it("keeps edits in separate nodes separate", () => {
|
||||
// Two single-character replacements in two different paragraphs sit
|
||||
// close together positionally, but the gap between them crosses a
|
||||
// paragraph boundary and must not be merged.
|
||||
const changes = changesFor(paras("c", "d"), paras("a", "b"));
|
||||
|
||||
expect(changes).toHaveLength(2);
|
||||
expect(deletedText(changes[0])).toBe("a");
|
||||
expect(deletedText(changes[1])).toBe("b");
|
||||
});
|
||||
|
||||
it("keeps edits separated by a large unchanged gap separate", () => {
|
||||
// "quick" and "fox" are replaced, but the unchanged " brown " between
|
||||
// them exceeds the merge gap threshold, so they stay distinct.
|
||||
const changes = changesFor(
|
||||
para("The slow brown lazy"),
|
||||
para("The quick brown fox")
|
||||
);
|
||||
|
||||
expect(changes).toHaveLength(2);
|
||||
expect(deletedText(changes[0])).toBe("quick");
|
||||
expect(deletedText(changes[1])).toBe("fox");
|
||||
});
|
||||
|
||||
it("does not merge a cluster of pure insertions", () => {
|
||||
// Inserting text on both sides of the unchanged "a" produces two pure
|
||||
// insertions a short gap apart. Merging them would render the unchanged
|
||||
// "a" as inserted, so the window (no deletion) must not merge.
|
||||
const changes = changesFor(para("foo a bar"), para("a"));
|
||||
|
||||
expect(changes).toHaveLength(2);
|
||||
// Both are pure insertions — nothing is marked as deleted.
|
||||
expect(changes.every((change) => change.deleted.length === 0)).toBe(true);
|
||||
});
|
||||
|
||||
it("does not merge a cluster of pure deletions", () => {
|
||||
// Deleting text on both sides of the unchanged "a" produces two pure
|
||||
// deletions a short gap apart. Merging them would render the unchanged
|
||||
// "a" as deleted, so the window (no insertion) must not merge.
|
||||
const changes = changesFor(para("a"), para("foo a bar"));
|
||||
|
||||
expect(changes).toHaveLength(2);
|
||||
// The unchanged "a" is not absorbed into either deletion.
|
||||
expect(deletedText(changes[0])).toBe("foo ");
|
||||
expect(deletedText(changes[1])).toBe(" bar");
|
||||
});
|
||||
});
|
||||
|
||||
describe("gap merge threshold", () => {
|
||||
// Two word replacements ("cat"->"fox", "dog"->"pig") separated by an
|
||||
// unchanged middle word. The gap between the two changes equals the
|
||||
// middle length plus its two surrounding spaces. mergeInterleavedChanges
|
||||
// merges them only while that gap is <= MAX_UNCHANGED_GAP (3).
|
||||
//
|
||||
// The trade-off these cases document: when the gap is small enough to
|
||||
// merge, the unchanged middle word is absorbed into the deletion and
|
||||
// therefore rendered as deleted + reinserted.
|
||||
|
||||
it("merges across a gap of 3, absorbing the unchanged middle", () => {
|
||||
// " a " => gap of 3 (1 char + 2 spaces), the threshold.
|
||||
const changes = changesFor(para("fox a pig"), para("cat a dog"));
|
||||
|
||||
expect(changes).toHaveLength(1);
|
||||
// The unchanged "a" is swallowed into the merged deletion.
|
||||
expect(deletedText(changes[0])).toBe("cat a dog");
|
||||
});
|
||||
|
||||
it("does not merge across a gap of 4", () => {
|
||||
// " ab " => gap of 4 (2 chars + 2 spaces), just past the threshold.
|
||||
const changes = changesFor(para("fox ab pig"), para("cat ab dog"));
|
||||
|
||||
expect(changes).toHaveLength(2);
|
||||
// The two edits stay distinct and the middle "ab" is untouched.
|
||||
expect(deletedText(changes[0])).toBe("cat");
|
||||
expect(deletedText(changes[1])).toBe("dog");
|
||||
});
|
||||
});
|
||||
|
||||
it("does not affect a simple single-word change", () => {
|
||||
const changes = changesFor(
|
||||
para("Hello modified world"),
|
||||
para("Hello world")
|
||||
);
|
||||
|
||||
expect(changes).toHaveLength(1);
|
||||
expect(changes[0].inserted).toHaveLength(1);
|
||||
});
|
||||
});
|
||||
@@ -8,6 +8,155 @@ import { recreateTransform } from "./prosemirror-recreate-transform";
|
||||
import { richExtensions, withComments } from "../nodes";
|
||||
import type { ProsemirrorData } from "../../types";
|
||||
|
||||
/**
|
||||
* The structural subset of a changeset `Change` that this module reads and
|
||||
* produces. Using a `Pick` rather than the `Change` class avoids requiring
|
||||
* class-only members (such as `toJSON`) on the plain objects we build.
|
||||
*/
|
||||
type ChangeFields<T> = Pick<
|
||||
Change<T>,
|
||||
"fromA" | "toA" | "fromB" | "toB" | "deleted" | "inserted"
|
||||
>;
|
||||
|
||||
/**
|
||||
* The maximum number of unchanged characters allowed between two adjacent
|
||||
* changes for them to still be merged into a single change. This is
|
||||
* deliberately small: merging absorbs the gap into the diff, so any unchanged
|
||||
* text within it is rendered as deleted/reinserted. A value of 3 is the
|
||||
* minimum that rejoins hyphenated word fragments such as replacing
|
||||
* "no-await-in-loop", where the differ aligns the shared "no" and leaves an
|
||||
* unchanged "no-" (gap of 3) between fragments, without swallowing longer
|
||||
* unchanged words.
|
||||
*/
|
||||
const MAX_UNCHANGED_GAP = 3;
|
||||
|
||||
/**
|
||||
* Merges adjacent Change objects that represent interleaved deletions/insertions.
|
||||
*
|
||||
* When word-level diffing is used, replacing "no-await-in-loop" with
|
||||
* "jsx-no-jsx-as-prop" can produce multiple adjacent Change objects like:
|
||||
* Change1: delete "no", insert "jsx"
|
||||
* Change2: delete "await", insert "no"
|
||||
* ...
|
||||
*
|
||||
* This function merges such adjacent changes into a single change:
|
||||
* Change: delete "no-await-in-loop", insert "jsx-no-jsx-as-prop"
|
||||
*
|
||||
* @param changes - The changes from simplifyChanges
|
||||
* @param docOld - The old document (to extract merged deletion content)
|
||||
* @returns Changes with adjacent interleaved changes merged
|
||||
*/
|
||||
function mergeInterleavedChanges<T extends { step: Step; slice: Slice | null }>(
|
||||
changes: readonly ChangeFields<T>[],
|
||||
docOld: Node
|
||||
): ChangeFields<T>[] {
|
||||
if (changes.length <= 1) {
|
||||
return [...changes];
|
||||
}
|
||||
|
||||
const result: ChangeFields<T>[] = [];
|
||||
let i = 0;
|
||||
|
||||
while (i < changes.length) {
|
||||
const current = changes[i];
|
||||
|
||||
// Check if this change and subsequent changes form a contiguous replacement
|
||||
// (i.e., they're adjacent in both old and new document positions)
|
||||
let j = i + 1;
|
||||
while (j < changes.length) {
|
||||
const prev = changes[j - 1];
|
||||
const next = changes[j];
|
||||
|
||||
// Check if changes are adjacent (toA of prev equals fromA of next, same for B)
|
||||
// Allow gaps (like hyphens or other unchanged characters between changes)
|
||||
const gapA = next.fromA - prev.toA;
|
||||
const gapB = next.fromB - prev.toB;
|
||||
|
||||
// Check if changes are in the same parent node (e.g., same table cell, same paragraph)
|
||||
// by verifying that the gap in the old document doesn't cross node boundaries
|
||||
let crossesNodeBoundary = false;
|
||||
if (gapA > 0) {
|
||||
try {
|
||||
// If the gap contains any non-text node, it crosses a boundary
|
||||
const gap = docOld.slice(prev.toA, next.fromA).content;
|
||||
for (let n = 0; n < gap.childCount; n++) {
|
||||
if (!gap.child(n).isText) {
|
||||
crossesNodeBoundary = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
crossesNodeBoundary = true;
|
||||
}
|
||||
}
|
||||
|
||||
// If gaps are equal, reasonably small, and don't cross node boundaries,
|
||||
// they're part of the same logical replacement
|
||||
if (
|
||||
gapA === gapB &&
|
||||
gapA >= 0 &&
|
||||
gapA <= MAX_UNCHANGED_GAP &&
|
||||
!crossesNodeBoundary
|
||||
) {
|
||||
j++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// The merged change only needs the first deletion/insertion in the window
|
||||
// to carry forward the originating step; the spans themselves are
|
||||
// recomputed from the window bounds below.
|
||||
let firstDeleted: { length: number; data: T } | undefined;
|
||||
let firstInserted: { length: number; data: T } | undefined;
|
||||
for (let k = i; k < j; k++) {
|
||||
firstDeleted ??= changes[k].deleted[0];
|
||||
firstInserted ??= changes[k].inserted[0];
|
||||
}
|
||||
|
||||
// Merge the window only when it is a genuine replacement — it must contain
|
||||
// both a deletion and an insertion. Otherwise a cluster of pure insertions
|
||||
// (or pure deletions) separated by a short unchanged gap would merge and
|
||||
// render the unchanged text between them as inserted/deleted.
|
||||
if (j > i + 1 && firstDeleted && firstInserted) {
|
||||
const lastChange = changes[j - 1];
|
||||
|
||||
// Create merged change. The deletion slice holds the original (old) text
|
||||
// spanning the whole window so it renders as one block; it is not treated
|
||||
// as a modification downstream because its text differs from the insertion.
|
||||
const mergedChange: ChangeFields<T> = {
|
||||
fromA: current.fromA,
|
||||
toA: lastChange.toA,
|
||||
fromB: current.fromB,
|
||||
toB: lastChange.toB,
|
||||
deleted: [
|
||||
{
|
||||
length: lastChange.toA - current.fromA,
|
||||
data: {
|
||||
...firstDeleted.data,
|
||||
slice: docOld.slice(current.fromA, lastChange.toA),
|
||||
} as T,
|
||||
},
|
||||
],
|
||||
inserted: [
|
||||
{
|
||||
length: lastChange.toB - current.fromB,
|
||||
data: firstInserted.data,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
result.push(mergedChange);
|
||||
i = j;
|
||||
} else {
|
||||
result.push(current);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a modification (attribute change) in the document.
|
||||
*/
|
||||
@@ -147,7 +296,11 @@ export class ChangesetHelper {
|
||||
}))
|
||||
);
|
||||
|
||||
let changes = simplifyChanges(changeset.changes, docNew);
|
||||
// Merge interleaved deletions/insertions into cleaner blocks
|
||||
const changes = mergeInterleavedChanges(
|
||||
simplifyChanges(changeset.changes, docNew),
|
||||
docOld
|
||||
);
|
||||
|
||||
// Post-process changes to detect modifications (attribute-only changes)
|
||||
const extendedChanges: ExtendedChange[] = changes.map((change) => {
|
||||
|
||||
Reference in New Issue
Block a user