Files
outline/server/utils/DocumentConverter.ts
Tom Moor d0ede882c6 perf: More memory improvements (#12539)
* perf: Lazy import mailparser, @fast-csv, and franc deps

Moves heavy dependencies off the startup path into the narrow async code
paths that actually use them, mirroring the mammoth lazy-import change:

- mailparser: only needed for Confluence Word imports (confluenceToHtml)
- @fast-csv/parse: only needed for CSV imports (csvToMarkdown)
- franc / iso-639-3: only needed by the DocumentUpdateText worker task

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* perf: Lazy import jsdom dep

jsdom is one of the heaviest server dependencies but is only needed for
HTML export (ProsemirrorHelper.toHTML) and HTML import
(DocumentConverter.htmlToProsemirror). Move it to a lazy `await import`
inside those methods so its dependency tree stays off the startup path.

Both methods become async; all callers were already in async contexts.
The type-only usage in patchGlobalEnv is now an `import type`.
2026-05-30 17:31:04 -04:00

574 lines
18 KiB
TypeScript

import { escapeRegExp } from "es-toolkit/compat";
import type { Node } from "prosemirror-model";
import { DOMParser as ProsemirrorDOMParser } from "prosemirror-model";
import yaml from "js-yaml";
import { schema, serializer } from "@server/editor";
import { FileImportError } from "@server/errors";
import { trace, traceFunction } from "@server/logging/tracing";
import { ProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper";
/**
 * The outcome of converting an imported file into a document.
 */
export interface ConvertResult {
  /** Markdown serialization of the converted document. */
  text: string;

  /** Prosemirror node tree of the converted document. */
  doc: Node;

  /** Title taken from a leading H1 heading, if the document has one. */
  title: string;

  /** Emoji/icon pulled from the very start of the document, if any. */
  icon?: string;
}
@trace()
export class DocumentConverter {
  /**
   * Convert an incoming file to a structured document result.
   *
   * Formats that can be expressed as HTML (HTML, docx, Confluence Word
   * exports) are converted to HTML and parsed into Prosemirror; everything
   * else is converted to markdown first.
   *
   * @param content The content of the file.
   * @param fileName The name of the file, including extension.
   * @param mimeType The mime type of the file.
   * @returns The converted document with text, doc, title, and icon.
   * @throws The error created by `FileImportError` when the file type is not
   * supported or the file cannot be parsed.
   */
  public static async convert(
    content: Buffer | string,
    fileName: string,
    mimeType: string
  ): Promise<ConvertResult> {
    let doc: Node;

    // Route to appropriate conversion method
    const html = await this.convertToHtml(content, fileName, mimeType);
    if (html !== undefined) {
      doc = await this.htmlToProsemirror(html);
    } else {
      const markdown = await this.convertToMarkdown(
        content,
        fileName,
        mimeType
      );
      doc = ProsemirrorHelper.toProsemirror(markdown);
    }

    // Extract title from first H1 heading
    let title = "";
    const headings = ProsemirrorHelper.getHeadings(doc);
    if (headings.length > 0 && headings[0].level === 1) {
      title = headings[0].title;
      doc = ProsemirrorHelper.removeFirstHeading(doc);
    }

    // Extract emoji from start of document
    const { emoji: icon, doc: docWithoutEmoji } =
      ProsemirrorHelper.extractEmojiFromStart(doc);
    doc = docWithoutEmoji;

    // Serialize to markdown and trim whitespace
    const text = serializer.serialize(doc).trim();

    return {
      text,
      doc,
      title,
      icon,
    };
  }

  /**
   * Convert HTML content directly to a Prosemirror document node.
   *
   * @param content The HTML content as a string or Buffer.
   * @returns A Prosemirror Node representing the document.
   */
  public static async htmlToProsemirror(
    content: Buffer | string
  ): Promise<Node> {
    const html = this.bufferToString(content);

    // Loaded lazily to keep jsdom off the startup path — only HTML imports need it.
    const { JSDOM } = await import("jsdom");
    const dom = new JSDOM(html);
    const document = dom.window.document;

    // Remove problematic elements before parsing
    const elementsToRemove = document.querySelectorAll(
      "script, style, title, head, meta, link"
    );
    elementsToRemove.forEach((el) => el.remove());

    // Preprocess the DOM to handle edge cases
    this.preprocessHtmlForImport(document);

    // Patch global environment for Prosemirror DOMParser
    const cleanup = ProsemirrorHelper.patchGlobalEnv(dom.window);
    try {
      const domParser = ProsemirrorDOMParser.fromSchema(schema);
      return domParser.parse(document.body);
    } finally {
      // Always undo the global patch, even when parsing throws.
      cleanup();
    }
  }

  /**
   * Preprocesses HTML DOM before Prosemirror parsing to cleanup
   * images and other elements. The document is mutated in place.
   *
   * @param document The DOM document to preprocess.
   */
  private static preprocessHtmlForImport(document: Document): void {
    // Handle images: filter emoticons, remove Jira icons, apply Confluence sizing
    const images = document.querySelectorAll("img");
    images.forEach((img) => {
      const className = img.className || "";

      // Skip emoticon images (they'll be dropped)
      if (className.includes("emoticon")) {
        img.remove();
        return;
      }

      // Remove Jira icon images
      if (
        className === "icon" &&
        img.parentElement?.className.includes("jira-issue-key")
      ) {
        img.remove();
        return;
      }

      // Handle Confluence image sizing: data-width/data-height → width/height
      const dataWidth = img.getAttribute("data-width");
      const dataHeight = img.getAttribute("data-height");
      const width = img.getAttribute("width");
      if (dataWidth && dataHeight && width) {
        const ratio = parseInt(dataWidth, 10) / parseInt(width, 10);
        const calculatedHeight = Math.round(parseInt(dataHeight, 10) / ratio);
        // Non-numeric or zero attributes yield NaN, Infinity or 0 — never
        // write those into the DOM as a height.
        if (Number.isFinite(calculatedHeight) && calculatedHeight > 0) {
          img.setAttribute("height", String(calculatedHeight));
        }
      }

      // Extract dimensions from data URI images that lack width/height
      // (e.g. images embedded by mammoth during docx import).
      // Only decode a small prefix of the base64 data — headers for all
      // supported formats live within the first 64 KB of the file.
      if (!img.getAttribute("width") && !img.getAttribute("height")) {
        const src = img.getAttribute("src") || "";
        if (src.startsWith("data:") && src.includes(";base64,")) {
          const base64Start = src.indexOf(";base64,") + 8;
          // 4 base64 chars → 3 bytes; decode at most ~64 KB of image data.
          const maxBase64Chars = Math.ceil(65536 / 3) * 4;
          const base64Prefix = src.slice(
            base64Start,
            base64Start + maxBase64Chars
          );
          const dimensions = this.getImageDimensionsFromBuffer(
            Buffer.from(base64Prefix, "base64")
          );
          if (dimensions) {
            img.setAttribute("width", String(dimensions.width));
            img.setAttribute("height", String(dimensions.height));
          }
        }
      }
    });
  }

  /**
   * Attempts to convert content to HTML for formats that support it.
   * Returns undefined for formats that should be parsed as markdown directly.
   *
   * @param content The content of the file.
   * @param fileName The name of the file, including extension.
   * @param mimeType The mime type of the file.
   * @returns HTML string if convertible, undefined otherwise.
   */
  private static async convertToHtml(
    content: Buffer | string,
    fileName: string,
    mimeType: string
  ): Promise<string | undefined> {
    // First try to convert based on the mime type
    switch (mimeType) {
      case "text/html":
        return this.bufferToString(content);
      case "application/msword":
        return this.confluenceToHtml(content);
      case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return this.docxToHtml(content);
      default:
        break;
    }

    // Try to convert based on the file extension (case-insensitive, so that
    // e.g. "REPORT.DOCX" is handled the same as "report.docx").
    switch (this.getFileExtension(fileName)) {
      case "html":
        return this.bufferToString(content);
      case "docx":
        return this.docxToHtml(content);
      default:
        return undefined;
    }
  }

  /**
   * Converts content to markdown for text-based formats.
   *
   * @param content The content of the file.
   * @param fileName The name of the file, including extension.
   * @param mimeType The mime type of the file.
   * @returns Markdown string.
   * @throws The error created by `FileImportError` when neither the mime type
   * nor the file extension identifies a supported format.
   */
  private static async convertToMarkdown(
    content: Buffer | string,
    fileName: string,
    mimeType: string
  ): Promise<string> {
    let markdown: string;

    switch (mimeType) {
      case "text/plain":
      case "text/markdown":
        markdown = this.bufferToString(content);
        break;
      case "text/csv":
        return this.csvToMarkdown(content);
      default: {
        // Fall back to the (case-insensitive) file extension.
        switch (this.getFileExtension(fileName)) {
          case "md":
          case "markdown":
            markdown = this.bufferToString(content);
            break;
          default:
            throw FileImportError(`File type ${mimeType} not supported`);
        }
      }
    }

    // Process frontmatter and convert it to a YAML codeblock
    return this.processFrontmatter(markdown);
  }

  /**
   * Get the lower-cased text after the last "." in a file name.
   *
   * @param fileName The name of the file.
   * @returns The lower-cased extension; the whole lower-cased name when it
   * contains no ".".
   */
  private static getFileExtension(fileName: string): string {
    return fileName.split(".").pop()?.toLowerCase() ?? "";
  }

  /**
   * Convert a docx file to HTML using mammoth.
   *
   * @param content The docx file content as a Buffer.
   * @returns The HTML representation of the document.
   * @throws The error created by `FileImportError` when content is not a Buffer.
   */
  private static async docxToHtml(content: Buffer | string): Promise<string> {
    if (content instanceof Buffer) {
      // Loaded lazily to keep mammoth off the startup path — only docx imports need it.
      const mammoth = (await import("mammoth")).default;
      const { value } = await traceFunction({ spanName: "convertToHtml" })(
        mammoth.convertToHtml
      )({
        buffer: content,
      });
      return value;
    }

    throw FileImportError("Unsupported Word file");
  }

  /**
   * Convert a Confluence Word export to HTML.
   *
   * References to each attachment are replaced with an inline data URI that
   * carries the attachment's own image content type.
   *
   * @param content The Confluence Word export content.
   * @returns The HTML representation of the document.
   * @throws The error created by `FileImportError` when the content is not a
   * multipart Confluence export or contains no HTML part.
   */
  private static async confluenceToHtml(
    content: Buffer | string
  ): Promise<string> {
    const source = this.bufferToString(content);

    // We're only supporting the output from Confluence here, regular Word documents should call
    // into the docxToHtml importer. See: https://jira.atlassian.com/browse/CONFSERVER-38237
    if (!source.includes("Content-Type: multipart/related")) {
      throw FileImportError("Unsupported Word file");
    }

    // Confluence "Word" documents are actually just multi-part email messages, so we can use
    // mailparser to parse the content. Loaded lazily to keep mailparser off the startup path —
    // only Confluence Word imports need it.
    const { simpleParser } = await import("mailparser");
    const parsed = await simpleParser(source);
    if (!parsed.html) {
      throw FileImportError("Unsupported Word file (No content found)");
    }

    // Map the last path segment of each attachment's content-location to a
    // data URI. The first attachment wins when two share the same id.
    const dataUris = new Map<string, string>();
    for (const attachment of parsed.attachments) {
      const contentLocation =
        (attachment.headers.get("content-location") as string | undefined) ??
        "";
      const id = contentLocation.split("/").pop();
      if (!id || dataUris.has(id)) {
        continue;
      }

      // Use the attachment's real image type; fall back to PNG when the part
      // does not declare an image content type.
      const contentType = attachment.contentType?.startsWith("image/")
        ? attachment.contentType
        : "image/png";
      dataUris.set(
        id,
        `data:${contentType};base64,${attachment.content.toString("base64")}`
      );
    }

    if (dataUris.size === 0) {
      return parsed.html;
    }

    // Replace every id in a single pass so that an id can never match inside
    // base64 data that was inserted for another attachment. Longer ids come
    // first so that an id which is a prefix of another does not shadow it.
    const pattern = new RegExp(
      [...dataUris.keys()]
        .sort((a, b) => b.length - a.length)
        .map((id) => escapeRegExp(id))
        .join("|"),
      "g"
    );

    // A replacer function avoids any "$" substitution patterns being
    // interpreted in the replacement text.
    return parsed.html.replace(pattern, (id) => dataUris.get(id) ?? id);
  }

  /**
   * Convert a CSV file to a markdown table.
   *
   * @param content The CSV file content.
   * @returns A markdown table representation, or an empty string when the
   * file contains no non-empty rows.
   * @throws The error created by `FileImportError` when the CSV cannot be parsed.
   */
  private static async csvToMarkdown(
    content: Buffer | string
  ): Promise<string> {
    // Loaded lazily to keep @fast-csv off the startup path — only CSV imports need it.
    const { parse } = await import("@fast-csv/parse");

    const text = this.bufferToString(content).trim();
    const delimiter = this.detectCsvDelimiter(text);

    return new Promise<string>((resolve, reject) => {
      const rows: string[][] = [];

      const stream = parse({ delimiter })
        .on("error", (error) => {
          reject(
            FileImportError(`There was an error parsing the CSV file: ${error}`)
          );
        })
        .on("data", (row) => rows.push(row))
        .on("end", () => {
          // Surface failures as a rejection rather than an uncaught
          // exception inside the stream's event handler.
          try {
            resolve(this.csvRowsToMarkdownTable(rows));
          } catch (error) {
            reject(error);
          }
        });

      stream.write(text);
      stream.end();
    });
  }

  /**
   * Determine the separator used in CSV text based on the number of
   * occurrences of each candidate on the first non-empty line.
   *
   * @param text The CSV text.
   * @returns ";", "," or a tab; "," when no candidate occurs. On a tie the
   * earlier candidate in the order ";", ",", tab wins.
   */
  private static detectCsvDelimiter(text: string): string {
    const firstNonEmptyLine =
      text.split("\n").find((line) => line.trim().length > 0) ?? "";

    let best = { count: 0, separator: "," };
    for (const separator of [";", ",", "\t"]) {
      const count = firstNonEmptyLine.split(separator).length - 1;
      if (count > best.count) {
        best = { count, separator };
      }
    }
    return best.separator;
  }

  /**
   * Build a markdown table from parsed CSV rows. The first row with the most
   * common column count becomes the header; rows with a different column
   * count are dropped.
   *
   * @param rows The parsed CSV rows.
   * @returns The markdown table, or an empty string when there is no content.
   */
  private static csvRowsToMarkdownTable(rows: string[][]): string {
    // Filter out completely empty rows
    const nonEmptyRows = rows.filter((row) =>
      row.some((cell) => cell.trim() !== "")
    );
    if (nonEmptyRows.length === 0) {
      return "";
    }

    // Check if all rows have a trailing empty cell (trailing comma artifact)
    // Only trim if ALL non-empty rows end with an empty cell
    let trimmedRows = nonEmptyRows;
    while (
      trimmedRows.length > 0 &&
      trimmedRows.every(
        (row) => row.length > 0 && row[row.length - 1].trim() === ""
      )
    ) {
      trimmedRows = trimmedRows.map((row) => row.slice(0, -1));
    }

    // Find the most common column count
    const columnCounts = new Map<number, number>();
    for (const row of trimmedRows) {
      if (row.length > 0) {
        columnCounts.set(row.length, (columnCounts.get(row.length) || 0) + 1);
      }
    }

    // Get the column count that appears most frequently
    let expectedColumns = 0;
    let maxFrequency = 0;
    for (const [count, frequency] of columnCounts) {
      if (frequency > maxFrequency) {
        maxFrequency = frequency;
        expectedColumns = count;
      }
    }

    // Find the first row with the expected column count (this is the header)
    const headerIndex = trimmedRows.findIndex(
      (row) => row.length === expectedColumns
    );
    if (headerIndex === -1) {
      return "";
    }

    const toTableRow = (cells: string[]): string =>
      `| ${cells.map((cell) => this.escapeTableCell(cell)).join(" | ")} |`;

    const headers = trimmedRows[headerIndex];
    const table = trimmedRows
      .slice(headerIndex + 1)
      .filter((row) => row.length === expectedColumns)
      .map(toTableRow)
      .join("\n");

    const headerLine = toTableRow(headers);
    const separatorLine = `| ${headers.map(() => "---").join(" | ")} |`;

    return `${headerLine}\n${separatorLine}\n${table}\n`;
  }

  /**
   * Make a CSV cell safe to embed in a single markdown table row.
   *
   * @param cell The raw cell text.
   * @returns The cell with line breaks collapsed to a space and "|" escaped,
   * so the cell can neither end the row nor introduce extra columns.
   */
  private static escapeTableCell(cell: string): string {
    return cell.replace(/\r\n|\r|\n/g, " ").replace(/\|/g, "\\|");
  }

  /**
   * Convert a Buffer to a string.
   *
   * @param content The content as a Buffer or string.
   * @returns The content as a string (Buffers are decoded as UTF-8).
   */
  private static bufferToString(content: Buffer | string): string {
    return typeof content === "string" ? content : content.toString("utf8");
  }

  /**
   * Parse and convert frontmatter to a YAML codeblock.
   *
   * @param content The markdown content that may contain frontmatter.
   * @returns The markdown content with frontmatter converted to a YAML
   * codeblock, or the content unchanged when there is no frontmatter or it
   * is not valid YAML.
   */
  private static processFrontmatter(content: string): string {
    // Frontmatter must start at the beginning of the document. Both LF and
    // CRLF line endings are accepted.
    const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n|$)/;
    const match = frontmatterRegex.exec(content);
    if (!match) {
      return content;
    }

    const frontmatterContent = match[1].replace(/\r\n/g, "\n");
    const remainingContent = content.slice(match[0].length);

    // Validate that the frontmatter is valid YAML.
    // NOTE(review): js-yaml v4 `load` uses a safe schema by default; on v3
    // this call should be `safeLoad` — confirm the installed version.
    try {
      yaml.load(frontmatterContent);
    } catch {
      // If it's not valid YAML, return content unchanged
      return content;
    }

    // Convert frontmatter to a YAML codeblock. Lengthen the fence until it
    // does not occur in the content, so embedded backticks cannot close it.
    let codeBlockDelimiter = "```";
    while (frontmatterContent.includes(codeBlockDelimiter)) {
      codeBlockDelimiter += "`";
    }
    const yamlCodeblock = `${codeBlockDelimiter}yaml\n${frontmatterContent}\n${codeBlockDelimiter}\n\n`;

    return yamlCodeblock + remainingContent;
  }

  /**
   * Parse image dimensions from a binary buffer. Supports PNG, JPEG, and GIF.
   * Only the leading bytes are inspected, so a truncated prefix of the image
   * is sufficient as long as it contains the relevant header.
   *
   * @param buffer The image data.
   * @returns The width and height if parseable, otherwise undefined.
   */
  private static getImageDimensionsFromBuffer(
    buffer: Buffer
  ): { width: number; height: number } | undefined {
    try {
      // PNG: signature + IHDR chunk
      if (
        buffer.length >= 24 &&
        buffer[0] === 0x89 &&
        buffer[1] === 0x50 &&
        buffer[2] === 0x4e &&
        buffer[3] === 0x47
      ) {
        return {
          width: buffer.readUInt32BE(16),
          height: buffer.readUInt32BE(20),
        };
      }

      // GIF: signature + logical screen descriptor
      if (
        buffer.length >= 10 &&
        buffer[0] === 0x47 &&
        buffer[1] === 0x49 &&
        buffer[2] === 0x46
      ) {
        return {
          width: buffer.readUInt16LE(6),
          height: buffer.readUInt16LE(8),
        };
      }

      // JPEG: scan for SOF marker (cap at 64 KB to bound work)
      if (buffer.length >= 2 && buffer[0] === 0xff && buffer[1] === 0xd8) {
        const scanLimit = Math.min(buffer.length, 65536);
        let offset = 2;

        while (offset + 1 < scanLimit) {
          if (buffer[offset] !== 0xff) {
            offset++;
            continue;
          }

          const marker = buffer[offset + 1];

          // A marker may be preceded by any number of 0xFF fill bytes. Skip
          // a single byte so the next iteration re-examines the following
          // 0xFF as the potential start of the real marker.
          if (marker === 0xff) {
            offset++;
            continue;
          }

          offset += 2;

          // Standalone markers without a payload
          if (
            marker === 0x00 ||
            marker === 0x01 ||
            (marker >= 0xd0 && marker <= 0xd9)
          ) {
            continue;
          }

          if (offset + 2 > scanLimit) {
            break;
          }

          const segmentLength = buffer.readUInt16BE(offset);

          // SOF markers contain the frame dimensions — check before
          // the advance guard since this returns immediately.
          if (
            (marker >= 0xc0 && marker <= 0xc3) ||
            (marker >= 0xc5 && marker <= 0xc7) ||
            (marker >= 0xc9 && marker <= 0xcb) ||
            (marker >= 0xcd && marker <= 0xcf)
          ) {
            // Layout after the marker: length (2), precision (1),
            // height (2), width (2).
            if (offset + 7 <= buffer.length) {
              return {
                height: buffer.readUInt16BE(offset + 3),
                width: buffer.readUInt16BE(offset + 5),
              };
            }
            break;
          }

          // Length includes itself and must be >= 2; bail on malformed data.
          if (segmentLength < 2 || offset + segmentLength > buffer.length) {
            break;
          }

          offset += segmentLength;
        }
      }
    } catch {
      // Best effort: return undefined if parsing fails
    }

    return undefined;
  }
}