Files
outline/server/queues/tasks/MarkdownAPIImportTask.ts
T
Tom Moor ee5164290d perf: Move Markdown importer to zip stream (#12372)
* perf: Move Markdown importer to zip stream

* refactor

* refactor: Extract zip walk + tree builder into ZipHelper

Adds `ZipHelper.walk` and `ZipHelper.toFileTree` so other importers can
stream zip contents without extracting to disk. Tree construction uses
an O(1) path → node map; `./`-prefixed entries are normalized, while
dotfiles, `__MACOSX`, and `..` segments are filtered.

* PR feedback
2026-05-18 18:32:58 -04:00

651 lines
22 KiB
TypeScript

import path from "node:path";
import { randomUUID } from "node:crypto";
import { escapeRegExp } from "es-toolkit/compat";
import mime from "mime-types";
import { UniqueConstraintError } from "sequelize";
import { DocumentValidation } from "@shared/validations";
import type {
ImportTaskInput,
ImportTaskOutput,
MarkdownAttachmentManifestItem,
MarkdownPageImportTaskInputItem,
} from "@shared/schema";
import type { IntegrationService, ProsemirrorDoc } from "@shared/types";
import { AttachmentPreset } from "@shared/types";
import attachmentCreator from "@server/commands/attachmentCreator";
import { createContext } from "@server/context";
import env from "@server/env";
import Logger from "@server/logging/Logger";
import type { ImportTask } from "@server/models";
import { Attachment } from "@server/models";
import AttachmentHelper, {
Buckets,
} from "@server/models/helpers/AttachmentHelper";
import { ProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper";
import { sequelize } from "@server/storage/database";
import FileStorage from "@server/storage/files";
import type { ZipTreeNode } from "@server/utils/ZipHelper";
import ZipHelper from "@server/utils/ZipHelper";
import type { ProcessOutput } from "./APIImportTask";
import APIImportTask from "./APIImportTask";
import { DocumentConverter } from "@server/utils/DocumentConverter";
/** Local shorthand for the Markdown integration service type that parameterizes this task's generics. */
type Markdown = IntegrationService.Markdown;
/**
* A document discovered while walking the zip's file tree during the
* bootstrap phase. Instances form a tree through `children`.
*/
interface DiscoveredDocument {
/** Pre-assigned identifier, generated with `randomUUID()` at discovery time. */
id: string;
/** Title copied from the zip tree node the document was created from. */
title: string;
/** Path of the originating node inside the zip; used as the docMap key. */
pathInZip: string;
/** Id of the discovered collection that contains this document. */
collectionId: string;
/** Id of the parent discovered document; absent for top-level documents. */
parentDocumentId?: string;
/** Markdown text loaded from the zip; empty string for folder-only nodes. */
markdownText: string;
/** Documents nested directly beneath this one. */
children: DiscoveredDocument[];
}
/**
* A collection discovered during bootstrap; one is created per top-level
* folder of the zip that is not an attachment-only folder.
*/
interface DiscoveredCollection {
/** Pre-assigned identifier, generated with `randomUUID()`. */
id: string;
/** Title copied from the top-level zip tree node. */
title: string;
/** Top-level documents of the collection, each carrying its own subtree. */
children: DiscoveredDocument[];
}
/** Minimal attachment description needed to rewrite paths in markdown text. */
interface AttachmentRef {
/** Identifier substituted into the `<<id>>` placeholder. */
id: string;
/** Path of the attachment file inside the zip. */
pathInZip: string;
}
/**
 * Rewrites local attachment paths in markdown text into `<<attachmentId>>`
 * placeholders.
 *
 * For each attachment the following spellings of its path are substituted,
 * in this order:
 *
 * 1. the full URI-encoded zip path;
 * 2. that path cut down to start at a legacy bucket folder (`uploads/`,
 *    `public/`), optionally preceded by `.` and/or `/`;
 * 3. the path starting at the first `attachments` segment (matched
 *    case-insensitively), optionally preceded by `.` and/or `/`;
 * 4. `<parent folder>/<encoded file name>`, optionally preceded by `.`
 *    and/or `/`.
 *
 * Zip paths always use `/` as the separator, so segments are split on `/`
 * rather than on the host platform's `path.sep`. Entries with an empty
 * `pathInZip` are skipped: an empty pattern would match between every
 * character of the text.
 *
 * Exported for tests; not part of the module's public surface.
 *
 * @param markdown The raw markdown text from a single document.
 * @param attachments Attachment manifest entries to substitute.
 * @returns Markdown text with local paths replaced by `<<id>>` references.
 */
export function rewriteAttachmentPaths(
  markdown: string,
  attachments: AttachmentRef[]
): string {
  // Loop-invariant: these strip everything in front of a legacy bucket folder.
  const uploadsPrefix = new RegExp(`(.*)/${Buckets.uploads}/`);
  const publicPrefix = new RegExp(`(.*)/${Buckets.public}/`);
  // Matches a literal path optionally preceded by `.` and/or `/`.
  const relativePattern = (literalPath: string): string =>
    `\\.?/?${escapeRegExp(literalPath)}`;

  let text = markdown;

  for (const attachment of attachments) {
    if (!attachment.pathInZip) {
      continue;
    }

    const reference = `<<${attachment.id}>>`;
    // A replacer function keeps `$` sequences in the id from being
    // interpreted as substitution patterns.
    const substitute = (input: string, pattern: string): string =>
      input.replace(new RegExp(pattern, "g"), () => reference);

    const encodedPath = encodeURI(attachment.pathInZip);
    const normalizedAttachmentPath = encodedPath
      .replace(uploadsPrefix, `${Buckets.uploads}/`)
      .replace(publicPrefix, `${Buckets.public}/`);

    text = substitute(text, escapeRegExp(encodedPath));
    text = substitute(text, relativePattern(normalizedAttachmentPath));

    const segments = attachment.pathInZip.split("/");
    const attachmentsIdx = segments.findIndex(
      (seg) => seg.toLowerCase() === "attachments"
    );
    if (attachmentsIdx >= 0) {
      const relFromAttachments = segments.slice(attachmentsIdx).join("/");
      text = substitute(text, relativePattern(encodeURI(relFromAttachments)));
    }

    const attachmentFileName = path.basename(attachment.pathInZip);
    const attachmentDir = path.basename(path.dirname(attachment.pathInZip));
    const genericNormalizedPath = `${attachmentDir}/${encodeURI(attachmentFileName)}`;
    text = substitute(text, relativePattern(genericNormalizedPath));
  }

  return text;
}
/**
 * Rewrites internal markdown links (`[label](./relative.md)`) into
 * `[label](<<documentId>>)` placeholders, resolved against a path → id map
 * built from the zip's full document tree.
 *
 * Only the target of the matched link itself is rewritten; other occurrences
 * of the same text elsewhere in the document are left untouched. Links whose
 * target cannot be URI-decoded, or that do not resolve to a docMap entry, are
 * returned unchanged.
 *
 * Exported for tests; not part of the module's public surface.
 *
 * @param markdown The raw markdown text from a single document.
 * @param documentPath Zip-relative path of the document being rewritten
 * (e.g. `Collection/parent.md`); used as the base for
 * resolving relative link targets against docMap keys.
 * @param docMap Map of document path (as it appeared in the zip) to its
 * pre-assigned externalId.
 * @returns Markdown text with internal `.md` link targets replaced by
 * `<<id>>` references.
 */
export function rewriteInternalLinks(
  markdown: string,
  documentPath: string,
  docMap: Record<string, string>
): string {
  // Zip paths (and therefore docMap keys) always use `/`, so resolve with the
  // POSIX flavour of `path` regardless of the host platform.
  const basePath = path.posix.dirname(documentPath);

  return markdown.replace(
    /(\[[^\]]+\]\()([^)]+\.md)\)/g,
    (full: string, prefix: string, referredDocPath: string): string => {
      let normalizedDocPath: string;
      try {
        normalizedDocPath = decodeURI(
          path.posix.normalize(`${basePath}/${referredDocPath}`)
        );
      } catch {
        // Malformed percent-escape in the link target: leave the link as-is
        // instead of failing the whole page.
        return full;
      }

      const referredDocId = docMap[normalizedDocPath];
      return referredDocId ? `${prefix}<<${referredDocId}>>)` : full;
    }
  );
}
export default class MarkdownAPIImportTask extends APIImportTask<Markdown> {
/**
* Always returns false for markdown imports.
* NOTE(review): presumably tells the base APIImportTask not to upload
* attachments while processing each page — in this class attachments are
* created in `onAllTasksCompleted` instead. Confirm against APIImportTask.
*
* @returns false.
*/
protected shouldUploadAttachmentsPerPage(): boolean {
return false;
}
/**
* Schedules a new MarkdownAPIImportTask job for the given import task.
*
* @param importTask The import task whose id is passed to the scheduled job.
*/
protected async scheduleNextTask(importTask: ImportTask<Markdown>) {
await new MarkdownAPIImportTask().schedule({ importTaskId: importTask.id });
}
/**
 * Completion hook: re-walks the stored zip and creates an Attachment for
 * every entry listed in the import's scratch manifest. Does nothing when the
 * scratch data has no storage key or an empty manifest. Manifest entries that
 * never show up in the zip are logged as warnings. The file handle is always
 * cleaned up, and cleanup failures are ignored.
 *
 * @param lastImportTask The final import task of the import.
 */
protected async onAllTasksCompleted(
  lastImportTask: ImportTask<Markdown>
): Promise<void> {
  const scratch = lastImportTask.import.scratch;
  const manifest = scratch?.manifest;
  if (!scratch?.storageKey || !manifest?.length) {
    return;
  }

  const handle = await FileStorage.getFileHandle(scratch.storageKey);

  try {
    const createdBy = lastImportTask.import.createdBy;

    const manifestByPath = new Map<string, MarkdownAttachmentManifestItem>();
    for (const manifestItem of manifest) {
      manifestByPath.set(manifestItem.pathInZip, manifestItem);
    }

    const maxAttachmentSize = AttachmentHelper.presetToMaxUploadSize(
      AttachmentPreset.DocumentAttachment
    );
    const foundPaths = new Set<string>();

    await ZipHelper.walk(handle.path, async (entry) => {
      if (entry.isDirectory) {
        return;
      }

      // Bring the entry name into the same shape as the bootstrap-phase
      // pathInZip: `/`-joined segments, no leading `./`, no empty segments.
      const lookupKey = entry.fileName
        .split("/")
        .filter(Boolean)
        .join("/");
      const item = manifestByPath.get(lookupKey);
      if (!item) {
        return;
      }

      foundPaths.add(item.pathInZip);
      const buffer = await entry.readBuffer(maxAttachmentSize);

      try {
        await sequelize.transaction(async (transaction) =>
          attachmentCreator({
            source: "import",
            preset: AttachmentPreset.DocumentAttachment,
            id: item.id,
            name: item.name,
            type: item.mimeType,
            buffer,
            user: createdBy,
            ctx: createContext({ user: createdBy, transaction }),
            fetchOptions: {
              timeout: env.FILE_STORAGE_IMPORT_TIMEOUT,
            },
          })
        );
      } catch (err) {
        // Every attachment is committed in its own transaction, so a retried
        // run can hit ids that were already stored. A unique-id collision is
        // therefore swallowed to keep the import resumable; anything else
        // propagates.
        if (!(err instanceof UniqueConstraintError)) {
          throw err;
        }
      }
    });

    manifest
      .filter((manifestItem) => !foundPaths.has(manifestItem.pathInZip))
      .forEach((manifestItem) => {
        Logger.warn(
          `Markdown import attachment missing in zip, skipping: ${manifestItem.pathInZip}`
        );
      });
  } finally {
    await handle.cleanup().catch(() => {});
  }
}
/**
* Bootstrap phase of a markdown import. Opens the zip referenced by the
* import's `scratch.storageKey`, builds its file tree while loading the text
* of every `.md` / `.markdown` entry, and turns each top-level folder that is
* not attachment-only into a collection with a tree of discovered documents.
* Files found in attachment folders, and loose non-markdown files, are
* recorded in the attachment manifest.
*
* Side effects: rewrites and saves the parent import's `input` and `scratch`,
* and reassigns `importTask.input` to the first item plus one placeholder per
* collection. The file handle is always cleaned up, and cleanup failures are
* ignored.
*
* @param importTask The bootstrap import task.
* @returns Collection outputs as `taskOutput`, and per-page inputs for the
* top-level documents of every collection as `childTasksInput`.
* @throws Error when `scratch.storageKey` is missing or the zip tree has no
* children.
*/
protected async processBootstrap(
importTask: ImportTask<Markdown>
): Promise<ProcessOutput<Markdown>> {
const storageKey = importTask.import.scratch?.storageKey;
if (!storageKey) {
throw new Error("Markdown import is missing scratch.storageKey");
}
const handle = await FileStorage.getFileHandle(storageKey);
try {
// Side map of pre-loaded markdown text keyed by tree node identity.
// ZipHelper's tree carries only metadata; we capture text during the
// single walk so the downstream pass doesn't have to re-open the zip.
const markdownByNode = new Map<ZipTreeNode, string>();
const tree = await ZipHelper.toFileTree(
handle.path,
async (node, entry) => {
const ext = path.extname(node.name).toLowerCase();
if (ext === ".md" || ext === ".markdown") {
const buffer = await entry.readBuffer(
DocumentValidation.maxStateLength
);
markdownByNode.set(node, buffer.toString("utf8"));
}
}
);
if (tree.children.length === 0) {
throw new Error("Could not find valid content in zip file");
}
const collections: DiscoveredCollection[] = [];
const manifest: MarkdownAttachmentManifestItem[] = [];
for (const node of tree.children) {
// Top-level nodes without children (loose files or empty folders) cannot
// become collections and are only logged.
if (node.children.length === 0) {
Logger.debug("task", `Unhandled file in zip: ${node.pathInZip}`, {
importTaskId: importTask.id,
});
continue;
}
if (this.isAttachmentFolder(node)) {
this.collectAttachments(node, manifest);
continue;
}
const collection: DiscoveredCollection = {
id: randomUUID(),
title: node.title,
children: [],
};
collections.push(collection);
this.collectDocumentsAndAttachments({
children: node.children,
collectionId: collection.id,
out: collection.children,
manifest,
markdownByNode,
});
}
// Build docMap (pathInZip -> externalId) for internal-link resolution.
// Walk the full document tree to collect every doc id, since internal
// markdown links can target any document regardless of depth.
const docMap: Record<string, string> = {};
const collectDocMap = (docs: DiscoveredDocument[]) => {
for (const d of docs) {
docMap[d.pathInZip] = d.id;
collectDocMap(d.children);
}
};
for (const c of collections) {
collectDocMap(c.children);
}
// Replace (not append) anything past the create-time placeholder with
// the freshly discovered collections so a retried bootstrap doesn't
// accumulate duplicate entries with fresh UUIDs from a previous
// partial run. ImportsProcessor's persistence pass treats these as
// collections.
// NOTE(review): assumes `input[0]` (the create-time placeholder) always
// exists — confirm against the code that creates the import.
const associatedImport = importTask.import;
const placeholder = associatedImport.input[0];
associatedImport.input = [
placeholder,
...collections.map((c) => ({
externalId: c.id,
permission: placeholder.permission,
})),
];
associatedImport.scratch = { storageKey, manifest };
await associatedImport.save();
// Append collection placeholder items so ImportsProcessor iterates
// them during the bootstrap row (the earliest createdAt) — that
// guarantees collections land in the DB before any per-page document
// references them.
const collectionInputItems: MarkdownPageImportTaskInputItem[] =
collections.map((c) => ({
externalId: c.id,
title: c.title,
path: c.title,
markdownText: "",
attachmentMap: [],
docMap: {},
}));
importTask.input = [importTask.input[0], ...collectionInputItems];
const collectionOutputs: ImportTaskOutput = collections.map((c) => ({
externalId: c.id,
title: c.title,
content: ProsemirrorHelper.getEmptyDocument() as ProsemirrorDoc,
}));
// First wave of document tasks: only top-level docs in each collection.
// Each carries its descendants in `children` and the per-page handler
// re-emits them as the next wave of childTasksInput, producing a strict
// depth-ordered cascade of ImportTask rows so parent FKs are always
// satisfied at child-doc creation time.
const childTasksInput: ImportTaskInput<Markdown> = collections.flatMap(
(c) => c.children.map((d) => this.toPageInput(d, manifest, docMap))
);
return { taskOutput: collectionOutputs, childTasksInput };
} finally {
await handle.cleanup().catch(() => {});
}
}
/**
 * Builds the per-page task input for a discovered document. The document's
 * descendants are converted recursively and nested under `children`, so
 * every level of the tree can be emitted as its own wave of tasks.
 *
 * @param doc The discovered document together with its subtree.
 * @param manifest The complete attachment manifest, filtered per page.
 * @param docMap Path → externalId map used for internal link rewriting.
 * @returns The task input for this page, including nested child inputs.
 */
private toPageInput(
  doc: DiscoveredDocument,
  manifest: MarkdownAttachmentManifestItem[],
  docMap: Record<string, string>
): MarkdownPageImportTaskInputItem {
  const attachmentMap = this.attachmentsReferencedBy(
    doc.markdownText,
    manifest
  );

  // Leaf documents carry `undefined` rather than an empty array.
  const nestedInputs =
    doc.children.length > 0
      ? doc.children.map((descendant) =>
          this.toPageInput(descendant, manifest, docMap)
        )
      : undefined;

  return {
    externalId: doc.id,
    parentExternalId: doc.parentDocumentId,
    collectionExternalId: doc.collectionId,
    title: doc.title,
    path: doc.pathInZip,
    markdownText: doc.markdownText,
    attachmentMap,
    docMap,
    children: nestedInputs,
  };
}
/**
 * Processes one wave of page inputs. Items with markdown text are rewritten
 * and converted to a Prosemirror document; items without text produce an
 * empty document. The direct children of every item are returned as the
 * input of the next wave.
 *
 * @param importTask The import task whose `input` holds the page items.
 * @returns One output per input item, plus the child inputs for the next wave.
 */
protected async processPage(
  importTask: ImportTask<Markdown>
): Promise<ProcessOutput<Markdown>> {
  const pages = importTask.input as MarkdownPageImportTaskInputItem[];
  const taskOutput: ImportTaskOutput = [];
  const childTasksInput: MarkdownPageImportTaskInputItem[] = [];

  for (const page of pages) {
    if (page.markdownText) {
      const converted = await DocumentConverter.convert(
        this.rewriteMarkdown(page),
        path.basename(page.path),
        "text/markdown"
      );
      taskOutput.push({
        externalId: page.externalId,
        title: converted.title || page.title,
        icon: converted.icon,
        content: converted.doc.toJSON() as ProsemirrorDoc,
      });
    } else {
      // No markdown text — the shape used by collection placeholders, which
      // pair their externalId with empty content. Collections are currently
      // persisted by the bootstrap task, so this is a defensive fallback.
      taskOutput.push({
        externalId: page.externalId,
        title: page.title,
        content: ProsemirrorHelper.getEmptyDocument() as ProsemirrorDoc,
      });
    }

    // Hand the direct descendants over as the next wave. Their ImportTask
    // rows are created after this one returns, so parents are persisted
    // before their children.
    const descendants = page.children ?? [];
    if (descendants.length > 0) {
      childTasksInput.push(...descendants);
    }
  }

  return { taskOutput, childTasksInput };
}
/**
 * Pre-rewrites a page's markdown text before conversion.
 *
 * 1. Internal `.md` links are reduced to `[label](<<id>>)` and then turned
 *    into mention markdown so the editor parses them as Document mentions.
 * 2. Attachment paths are reduced to `<<id>>` placeholders by the shared
 *    rewriter.
 * 3. Placeholders whose id belongs to one of the page's attachments are
 *    replaced with the attachment's redirect URL. Resolving at the text
 *    layer avoids markdown-it parsing `<<id>>` as an angle-bracket-wrapped
 *    URL (which produced broken image src attrs).
 *
 * Any other `<<…>>` sequence is left untouched, so text the author wrote
 * (for example `a << b >> c`) is not mistaken for an attachment placeholder.
 *
 * @param page The per-page task input.
 * @returns Rewritten markdown text ready for DocumentConverter.
 */
private rewriteMarkdown(page: MarkdownPageImportTaskInputItem): string {
  let text = rewriteInternalLinks(page.markdownText, page.path, page.docMap);

  // Convert `[label](<<id>>)` links from rewriteInternalLinks into mention
  // markdown the editor recognises: `@[label](mention://<uuid>/document/<id>)`.
  text = text.replace(
    /\[([^\]]+)\]\(<<([^>]+)>>\)/g,
    (_full, label: string, externalId: string) =>
      `@[${label}](mention://${randomUUID()}/document/${externalId})`
  );

  text = rewriteAttachmentPaths(
    text,
    page.attachmentMap.map((m) => ({ id: m.id, pathInZip: m.pathInZip }))
  );

  // Only ids that rewriteAttachmentPaths could have inserted are resolved.
  const attachmentIds = new Set(page.attachmentMap.map((m) => m.id));
  text = text.replace(/<<([^>]+)>>/g, (full: string, id: string) =>
    attachmentIds.has(id) ? Attachment.getRedirectUrl(id) : full
  );

  return text;
}
/**
 * Selects the manifest entries whose file name — either verbatim or
 * URI-encoded — occurs somewhere in the given markdown text. This keeps the
 * per-page task input limited to attachments the page may reference.
 *
 * @param markdown Raw markdown text for a single document.
 * @param manifest The full attachment manifest from the bootstrap phase.
 * @returns Manifest entries that appear (by filename) in the markdown.
 */
private attachmentsReferencedBy(
  markdown: string,
  manifest: MarkdownAttachmentManifestItem[]
): MarkdownAttachmentManifestItem[] {
  const referenced: MarkdownAttachmentManifestItem[] = [];

  for (const entry of manifest) {
    const baseName = path.basename(entry.pathInZip);
    if (markdown.includes(baseName) || markdown.includes(encodeURI(baseName))) {
      referenced.push(entry);
    }
  }

  return referenced;
}
/**
 * Decides whether a folder holds attachments only. A node without children
 * never qualifies; a folder titled `attachments` (any casing) always does.
 * Otherwise every nested folder must itself qualify and every file must have
 * an extension other than `.md` / `.markdown`.
 *
 * @param node ZipTreeNode to inspect.
 * @returns true when the folder appears to hold only attachments.
 */
private isAttachmentFolder(node: ZipTreeNode): boolean {
  if (node.children.length === 0) {
    return false;
  }
  if (node.title.toLowerCase() === "attachments") {
    return true;
  }

  for (const entry of node.children) {
    if (entry.children.length > 0) {
      if (!this.isAttachmentFolder(entry)) {
        return false;
      }
      continue;
    }

    const extension = path.extname(entry.name).toLowerCase();
    // Files without an extension, and markdown files, disqualify the folder.
    if (!extension || extension === ".md" || extension === ".markdown") {
      return false;
    }
  }

  return true;
}
/**
 * Adds every file beneath an attachment-only folder to the manifest,
 * descending into nested folders depth-first in child order. `pathInZip` is
 * stored as found on the tree node so the completion phase can locate the
 * same entry when it walks the archive again.
 *
 * @param node Attachment-folder ZipTreeNode.
 * @param manifest Manifest array to push entries into.
 */
private collectAttachments(
  node: ZipTreeNode,
  manifest: MarkdownAttachmentManifestItem[]
): void {
  // Explicit stack; children are pushed reversed so they pop in their
  // original order, visiting files exactly as a recursive walk would.
  const pending: ZipTreeNode[] = [...node.children].reverse();

  while (pending.length > 0) {
    const current = pending.pop();
    if (!current) {
      break;
    }

    if (current.children.length > 0) {
      pending.push(...[...current.children].reverse());
    } else {
      manifest.push({
        id: randomUUID(),
        name: current.name,
        pathInZip: current.pathInZip,
        mimeType: mime.lookup(current.name) || "application/octet-stream",
      });
    }
  }
}
/**
 * Walks a collection subtree and gathers documents (markdown files and
 * folders) and loose attachments. Documents are appended to `out` as a tree —
 * each entry's `children` holds its direct descendants. This is the shape the
 * per-page task cascade consumes.
 *
 * A markdown file and a folder with the same title are merged into a single
 * document. When the merged document takes its text from the markdown file
 * it also takes that file's `pathInZip`, whichever of the two appeared first
 * in the zip. The path → id map used for internal links is keyed by
 * `pathInZip`, and links point at the `.md` file, not the folder.
 *
 * @param children ZipTreeNode children of the current folder.
 * @param collectionId Pre-assigned id of the enclosing collection.
 * @param parentDocumentId Optional parent document id when nested.
 * @param out Sibling accumulator to push discovered documents into.
 * @param manifest Attachment manifest accumulator.
 * @param markdownByNode Pre-loaded markdown text keyed by tree node.
 */
private collectDocumentsAndAttachments({
  children,
  collectionId,
  parentDocumentId,
  out,
  manifest,
  markdownByNode,
}: {
  children: ZipTreeNode[];
  collectionId: string;
  parentDocumentId?: string;
  out: DiscoveredDocument[];
  manifest: MarkdownAttachmentManifestItem[];
  markdownByNode: Map<ZipTreeNode, string>;
}): void {
  for (const child of children) {
    const isFolder = child.children.length > 0;

    if (isFolder && this.isAttachmentFolder(child)) {
      this.collectAttachments(child, manifest);
      continue;
    }

    const ext = path.extname(child.name).toLowerCase();
    const isMarkdown = ext === ".md" || ext === ".markdown";

    // Loose non-markdown file: record it as an attachment.
    if (!isMarkdown && !isFolder) {
      manifest.push({
        id: randomUUID(),
        name: child.name,
        pathInZip: child.pathInZip,
        mimeType: mime.lookup(child.name) || "application/octet-stream",
      });
      continue;
    }

    const markdownText = isFolder ? "" : (markdownByNode.get(child) ?? "");

    // Folder-and-file with the same title (a "name.md" alongside a "name/"
    // directory) is merged onto a single document: the document body picks up
    // the file's markdown text, and the folder's contents become children.
    const sibling = out.find((d) => d.title === child.title);
    if (sibling) {
      if (sibling.markdownText === "" && markdownText) {
        sibling.markdownText = markdownText;
        // Key the merged document by the markdown file's path so internal
        // links to it resolve even when the folder was discovered first.
        sibling.pathInZip = child.pathInZip;
      }
      if (isFolder) {
        this.collectDocumentsAndAttachments({
          children: child.children,
          collectionId,
          parentDocumentId: sibling.id,
          out: sibling.children,
          manifest,
          markdownByNode,
        });
      }
      continue;
    }

    const node: DiscoveredDocument = {
      id: randomUUID(),
      title: child.title,
      pathInZip: child.pathInZip,
      collectionId,
      parentDocumentId,
      markdownText,
      children: [],
    };
    out.push(node);

    if (isFolder) {
      this.collectDocumentsAndAttachments({
        children: child.children,
        collectionId,
        parentDocumentId: node.id,
        out: node.children,
        manifest,
        markdownByNode,
      });
    }
  }
}
}