Files
outline/server/utils/DocumentConverter.test.ts
Tom Moor d0ede882c6 perf: More memory improvements (#12539)
* perf: Lazy import mailparser, @fast-csv, and franc deps

Moves heavy dependencies off the startup path into the narrow async code
paths that actually use them, mirroring the mammoth lazy-import change:

- mailparser: only needed for Confluence Word imports (confluenceToHtml)
- @fast-csv/parse: only needed for CSV imports (csvToMarkdown)
- franc / iso-639-3: only needed by the DocumentUpdateText worker task

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* perf: Lazy import jsdom dep

jsdom is one of the heaviest server dependencies but is only needed for
HTML export (ProsemirrorHelper.toHTML) and HTML import
(DocumentConverter.htmlToProsemirror). Move it to a lazy `await import`
inside those methods so its dependency tree stays off the startup path.

Both methods become async; all callers were already in async contexts.
The type-only usage in patchGlobalEnv is now an `import type`.
2026-05-30 17:31:04 -04:00

504 lines
16 KiB
TypeScript

import { DocumentConverter } from "./DocumentConverter";
describe("DocumentConverter", () => {
describe("convert", () => {
describe("csv", () => {
it("should convert csv to markdown table", async () => {
  // Comma-separated input: one header row followed by two records.
  const input = ["name,age", "John,25", "Jane,24"].join("\n");
  const { text, title } = await DocumentConverter.convert(
    input,
    "test.csv",
    "text/csv"
  );

  // The header row is expected as a markdown table header, with both
  // records present in the output.
  expect(text).toContain("| name | age |");
  expect(text).toContain("John");
  expect(text).toContain("Jane");
  // No title is expected for CSV input.
  expect(title).toEqual("");
});
it("should handle csv with semicolon delimiter", async () => {
  // Semicolon-delimited rows; the last record has a quoted field containing
  // doubled quotes and a comma.
  const rows = ["name;age", "John;25", '"Joan ""the bone"", Anne";24'];
  const converted = await DocumentConverter.convert(
    rows.join("\n"),
    "test.csv",
    "text/csv"
  );
  const { text } = converted;

  expect(text).toContain("| name | age |");
  expect(text).toContain("John");
  // The quoted field is expected to be unescaped and kept in one cell.
  expect(text).toContain('Joan "the bone", Anne');
});
it("should handle csv with title row before headers", async () => {
  // Some financial exports put a single-cell title row above the real headers.
  const lines = [
    '"Report for Account"',
    '"Symbol","Name","Value",',
    '"ABC","Test Corp","$100",',
    '"XYZ","Other Inc","$200",',
  ];
  const { text } = await DocumentConverter.convert(
    lines.join("\n"),
    "test.csv",
    "text/csv"
  );

  // The table header must come from the data headers rather than the title row.
  expect(text).toContain("| Symbol | Name | Value |");
  expect(text).toContain("ABC");
  expect(text).toContain("Test Corp");
  expect(text).toContain("XYZ");
});
it("should handle csv with trailing comma on each line", async () => {
  // Every line, including the header, ends with a dangling delimiter.
  const csv = `name,age,city,
John,25,NYC,
Jane,24,LA,`;
  const result = await DocumentConverter.convert(
    csv,
    "test.csv",
    "text/csv"
  );
  expect(result.text).toContain("| name | age | city |");
  expect(result.text).toContain("John");
  expect(result.text).toContain("Jane");
  // Should not have a trailing empty column, however much padding the empty
  // header cell would get. Only spaces and tabs are allowed between the two
  // pipes so the pattern cannot run across the line break into the next row.
  expect(result.text).not.toMatch(/\| city \|[ \t]*\|/);
});
it("should preserve intentionally empty cells at end of rows", async () => {
  // Only the last record ends with a delimiter, i.e. its city is empty.
  const input = ["name,age,city", "John,25,NYC", "Jane,24,"].join("\n");
  const { text } = await DocumentConverter.convert(
    input,
    "test.csv",
    "text/csv"
  );

  expect(text).toContain("| name | age | city |");
  expect(text).toContain("John");
  expect(text).toContain("NYC");
  // Jane's row must still have three columns, the last one being empty.
  expect(text).toMatch(/\| Jane \| 24\s*\|\s*\|/);
});
});
describe("html", () => {
it("should extract title from H1", async () => {
  const { title, text } = await DocumentConverter.convert(
    "<h1>My Title</h1><p>Content here</p>",
    "test.html",
    "text/html"
  );

  // The H1 is expected to become the title and to be absent from the body.
  expect(title).toEqual("My Title");
  expect(text).toContain("Content here");
  expect(text).not.toContain("My Title");
});
it("should extract emoji from start", async () => {
  const converted = await DocumentConverter.convert(
    "<p>🚀 Launch content</p>",
    "test.html",
    "text/html"
  );

  // The leading emoji is expected as the icon and no longer at the start of
  // the text.
  expect(converted.icon).toEqual("🚀");
  expect(converted.text).not.toMatch(/^🚀/);
});
});
describe("markdown", () => {
it("should extract title from H1", async () => {
  const { title, text } = await DocumentConverter.convert(
    "# My Title\n\nContent here",
    "test.md",
    "text/markdown"
  );

  // The leading H1 is expected as the title and must not remain in the text.
  expect(title).toEqual("My Title");
  expect(text).toContain("Content here");
  expect(text).not.toContain("My Title");
});
it("should return empty title when no H1", async () => {
  // The document starts with an H2, so there is nothing to use as a title.
  const source = ["## Subtitle", "", "Content here"].join("\n");
  const converted = await DocumentConverter.convert(
    source,
    "test.md",
    "text/markdown"
  );

  expect(converted.title).toEqual("");
  expect(converted.text).toContain("Subtitle");
});
it("should convert frontmatter to yaml codeblock", async () => {
  // Well-formed frontmatter at the very start, followed by an H1 and body.
  const md = `---
title: Test Document
date: 2024-01-15
tags: [test, markdown]
---
# My Title
Content after frontmatter`;
  const result = await DocumentConverter.convert(
    md,
    "test.md",
    "text/markdown"
  );
  // Frontmatter should be converted to a YAML codeblock
  expect(result.text).toContain("```yaml");
  expect(result.text).toContain("title: Test Document");
  expect(result.text).toContain("date: 2024-01-15");
  expect(result.text).toContain("tags: [test, markdown]");
  // The code block must be closed as well as opened. Checking for a bare
  // fence would already be satisfied by the opening "```yaml", so require at
  // least two fences.
  const fenceCount = (result.text.match(/```/g) ?? []).length;
  expect(fenceCount).toBeGreaterThanOrEqual(2);
  // Content should still be present
  expect(result.text).toContain("Content after frontmatter");
  // H1 should be extracted as title
  expect(result.title).toEqual("My Title");
});
it("should handle markdown without frontmatter", async () => {
  const { title, text } = await DocumentConverter.convert(
    "# Title\n\nRegular content",
    "test.md",
    "text/markdown"
  );

  expect(title).toEqual("Title");
  expect(text).toContain("Regular content");
  // Without frontmatter no YAML code block should appear.
  expect(text).not.toContain("```yaml");
});
it("should handle frontmatter with no content after", async () => {
  // The document consists of the frontmatter block only.
  const md = `---
title: Only Frontmatter
---`;
  const result = await DocumentConverter.convert(
    md,
    "test.md",
    "text/markdown"
  );
  expect(result.text).toContain("```yaml");
  expect(result.text).toContain("title: Only Frontmatter");
  // The code block must be closed as well as opened. Checking for a bare
  // fence would already be satisfied by the opening "```yaml", so require at
  // least two fences.
  const fenceCount = (result.text.match(/```/g) ?? []).length;
  expect(fenceCount).toBeGreaterThanOrEqual(2);
  // There is no H1, so no title is expected.
  expect(result.title).toEqual("");
});
it("should not convert incomplete frontmatter", async () => {
  // Opening delimiter only; the block is never closed.
  const source = [
    "---",
    "title: Test",
    "Content without closing delimiter",
  ].join("\n");
  const { text } = await DocumentConverter.convert(
    source,
    "test.md",
    "text/markdown"
  );

  // Not proper frontmatter, so it must not be turned into a YAML code block.
  expect(text).not.toContain("```yaml");
  expect(text).toContain("title: Test");
});
it("should not convert frontmatter if not at start", async () => {
  // A delimited block appears after other content.
  const source = [
    "# Title",
    "Some content",
    "---",
    "title: Test",
    "---",
    "More content",
  ].join("\n");
  const converted = await DocumentConverter.convert(
    source,
    "test.md",
    "text/markdown"
  );

  // Frontmatter only counts at the start of the document.
  expect(converted.text).not.toContain("```yaml");
});
it("should handle invalid YAML in frontmatter", async () => {
  // Correctly delimited block whose body is not valid YAML.
  const source = ["---", "invalid: yaml: content: here", "---", "Content"].join(
    "\n"
  );
  const { text } = await DocumentConverter.convert(
    source,
    "test.md",
    "text/markdown"
  );

  // Invalid YAML must not be converted.
  expect(text).not.toContain("```yaml");
});
});
});
describe("htmlToProsemirror", () => {
it("should convert basic HTML to Prosemirror", async () => {
  const doc = await DocumentConverter.htmlToProsemirror("<p>Hello world</p>");

  // The root must be a doc node holding exactly one child.
  expect(doc.type.name).toBe("doc");
  expect(doc.content.childCount).toBe(1);

  // That child is the paragraph carrying the original text.
  const onlyChild = doc.content.child(0);
  expect(onlyChild.type.name).toBe("paragraph");
  expect(onlyChild.textContent).toBe("Hello world");
});
it("should convert HTML with heading", async () => {
  const doc = await DocumentConverter.htmlToProsemirror(
    "<h1>Title</h1><p>Content</p>"
  );
  expect(doc.content.childCount).toBe(2);

  // First node: a level-1 heading with the H1 text.
  const heading = doc.content.child(0);
  expect(heading.type.name).toBe("heading");
  expect(heading.attrs.level).toBe(1);
  expect(heading.textContent).toBe("Title");

  // Second node: the paragraph.
  expect(doc.content.child(1).type.name).toBe("paragraph");
});
it("should remove script tags", async () => {
  const doc = await DocumentConverter.htmlToProsemirror(
    "<p>Safe content</p><script>alert('xss')</script>"
  );
  const { textContent } = doc;

  // Nothing from the script element may end up in the document text.
  expect(textContent).toBe("Safe content");
  expect(textContent).not.toContain("alert");
});
it("should remove style tags", async () => {
  const doc = await DocumentConverter.htmlToProsemirror(
    "<style>body { color: red; }</style><p>Content</p>"
  );
  const { textContent } = doc;

  // The stylesheet text must not leak into the document text.
  expect(textContent).toBe("Content");
  expect(textContent).not.toContain("color");
});
it("should handle Buffer input", async () => {
  // The HTML is passed as a UTF-8 encoded Buffer instead of a string.
  const doc = await DocumentConverter.htmlToProsemirror(
    Buffer.from("<p>From buffer</p>", "utf8")
  );

  const firstNode = doc.content.child(0);
  expect(firstNode.textContent).toBe("From buffer");
});
it("should convert HTML with lists", async () => {
  const doc = await DocumentConverter.htmlToProsemirror(
    "<ul><li>Item 1</li><li>Item 2</li></ul>"
  );
  expect(doc.content.childCount).toBe(1);

  // The single top-level node is a bullet list with one child per <li>.
  const list = doc.content.child(0);
  expect(list.type.name).toBe("bullet_list");
  expect(list.content.childCount).toBe(2);
});
it("should convert HTML with bold and italic", async () => {
  const doc = await DocumentConverter.htmlToProsemirror(
    "<p><strong>Bold</strong> and <em>italic</em></p>"
  );
  const para = doc.content.child(0);
  expect(para.type.name).toBe("paragraph");

  // Child 0 is the bold run and must carry the "strong" mark.
  const strongRun = para.content.child(0);
  expect(strongRun.text).toBe("Bold");
  const strongMarkNames = strongRun.marks.map((mark) => mark.type.name);
  expect(strongMarkNames.includes("strong")).toBe(true);

  // Child 2 (after the " and " text at index 1) is the italic run and must
  // carry the "em" mark.
  const emRun = para.content.child(2);
  expect(emRun.text).toBe("italic");
  const emMarkNames = emRun.marks.map((mark) => mark.type.name);
  expect(emMarkNames.includes("em")).toBe(true);
});
it("should handle full HTML document", async () => {
  // A complete document with doctype, head and body. The empty first and
  // last entries reproduce the leading and trailing line breaks.
  const markup = [
    "",
    "<!DOCTYPE html>",
    "<html>",
    "<head>",
    "<title>Test</title>",
    '<meta charset="utf-8">',
    "</head>",
    "<body>",
    "<h1>Document Title</h1>",
    "<p>Paragraph content</p>",
    "</body>",
    "</html>",
    "",
  ].join("\n");
  const doc = await DocumentConverter.htmlToProsemirror(markup);

  // Only the two body elements are expected as nodes.
  expect(doc.content.childCount).toBe(2);

  const heading = doc.content.child(0);
  expect(heading.type.name).toBe("heading");
  expect(heading.textContent).toBe("Document Title");

  const body = doc.content.child(1);
  expect(body.type.name).toBe("paragraph");
  expect(body.textContent).toBe("Paragraph content");
});
it("should remove emoticon images", async () => {
  const markup =
    '<p>Hello <img class="emoticon" src="smile.png" alt=":)"> world</p>';
  const { textContent } = await DocumentConverter.htmlToProsemirror(markup);

  // The emoticon image is dropped while the surrounding text is kept.
  expect(textContent).not.toContain(":)");
  expect(textContent).toContain("Hello");
  expect(textContent).toContain("world");
});
it("should remove Jira icon images", async () => {
  // An icon image nested inside a Jira issue key span, with a line break
  // before and after the paragraph.
  const markup = [
    "",
    '<p>Issue: <span class="jira-issue-key"><img class="icon" src="icon.png">ABC-123</span></p>',
    "",
  ].join("\n");
  const doc = await DocumentConverter.htmlToProsemirror(markup);

  // Only the label and the issue key remain as text.
  expect(doc.textContent).toBe("Issue: ABC-123");
});
it("should apply Confluence image sizing", async () => {
  // The image declares 800x600 via data attributes and a display width of 400.
  const markup = [
    "",
    '<p><img src="image.png" data-width="800" data-height="600" width="400"></p>',
    "",
  ].join("\n");
  const doc = await DocumentConverter.htmlToProsemirror(markup);

  const imageNode = doc.content.child(0).content.child(0);
  expect(imageNode.type.name).toBe("image");
  // The expected size is 400x300.
  expect(imageNode.attrs.width).toBe(400);
  expect(imageNode.attrs.height).toBe(300);
});
it("should extract dimensions from PNG data URI images", async () => {
  // Minimal PNG header for a 200x150 image (IHDR: width=200, height=150).
  // Only the signature and the start of the IHDR chunk are filled in; the
  // rest of the zero-initialised buffer is left untouched.
  const pngBuffer = Buffer.alloc(33);
  // PNG signature
  Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]).copy(
    pngBuffer
  );
  // IHDR chunk length (13 bytes)
  pngBuffer.writeUInt32BE(13, 8);
  // "IHDR"
  Buffer.from("IHDR").copy(pngBuffer, 12);
  // Width = 200
  pngBuffer.writeUInt32BE(200, 16);
  // Height = 150
  pngBuffer.writeUInt32BE(150, 20);
  const base64 = pngBuffer.toString("base64");
  // The <img> has no width/height attributes, so the dimensions must be read
  // from the image data itself.
  const html = `<p><img src="data:image/png;base64,${base64}"></p>`;
  const doc = await DocumentConverter.htmlToProsemirror(html);
  const paragraph = doc.content.child(0);
  const image = paragraph.content.child(0);
  expect(image.type.name).toBe("image");
  expect(image.attrs.width).toBe(200);
  expect(image.attrs.height).toBe(150);
});
it("should extract dimensions from JPEG data URI images", async () => {
  // Minimal JPEG: SOI marker immediately followed by an SOF0 segment.
  const jpeg = Buffer.alloc(20);
  // Bytes 0-1: SOI marker (ff d8); bytes 2-3: SOF0 marker (ff c0).
  jpeg.set([0xff, 0xd8, 0xff, 0xc0], 0);
  // Bytes 4-5: segment length.
  jpeg.writeUInt16BE(17, 4);
  // Byte 6: sample precision.
  jpeg.writeUInt8(8, 6);
  // Bytes 7-8: height = 300; bytes 9-10: width = 400.
  jpeg.writeUInt16BE(300, 7);
  jpeg.writeUInt16BE(400, 9);

  const encoded = jpeg.toString("base64");
  const doc = await DocumentConverter.htmlToProsemirror(
    `<p><img src="data:image/jpeg;base64,${encoded}"></p>`
  );

  const imageNode = doc.content.child(0).content.child(0);
  expect(imageNode.type.name).toBe("image");
  expect(imageNode.attrs.width).toBe(400);
  expect(imageNode.attrs.height).toBe(300);
});
it("should extract dimensions from GIF data URI images", async () => {
  // Minimal GIF header: 6-byte signature, then width and height as
  // little-endian 16-bit values.
  const gif = Buffer.alloc(10);
  gif.write("GIF89a", 0, "ascii");
  // Width = 320
  gif.writeUInt16LE(320, 6);
  // Height = 240
  gif.writeUInt16LE(240, 8);

  const encoded = gif.toString("base64");
  const doc = await DocumentConverter.htmlToProsemirror(
    `<p><img src="data:image/gif;base64,${encoded}"></p>`
  );

  const imageNode = doc.content.child(0).content.child(0);
  expect(imageNode.type.name).toBe("image");
  expect(imageNode.attrs.width).toBe(320);
  expect(imageNode.attrs.height).toBe(240);
});
it("should not override existing width/height on data URI images", async () => {
  // The PNG header encodes 200x150 while the HTML attributes say 100x75.
  const png = Buffer.alloc(33);
  // Signature in bytes 0-7.
  png.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0);
  // IHDR chunk: length, type, then width and height.
  png.writeUInt32BE(13, 8);
  png.write("IHDR", 12, "ascii");
  png.writeUInt32BE(200, 16);
  png.writeUInt32BE(150, 20);

  const encoded = png.toString("base64");
  const doc = await DocumentConverter.htmlToProsemirror(
    `<p><img src="data:image/png;base64,${encoded}" width="100" height="75"></p>`
  );

  const imageNode = doc.content.child(0).content.child(0);
  expect(imageNode.type.name).toBe("image");
  // The explicit HTML attributes win over the dimensions in the image data.
  expect(imageNode.attrs.width).toBe(100);
  expect(imageNode.attrs.height).toBe(75);
});
});
});