diff --git a/server/queues/tasks/MarkdownAPIImportTask.ts b/server/queues/tasks/MarkdownAPIImportTask.ts index 5221fa77bf..0f125ed97a 100644 --- a/server/queues/tasks/MarkdownAPIImportTask.ts +++ b/server/queues/tasks/MarkdownAPIImportTask.ts @@ -1,10 +1,9 @@ import path from "node:path"; import { randomUUID } from "node:crypto"; import { escapeRegExp } from "es-toolkit/compat"; -import fs from "fs-extra"; import mime from "mime-types"; import { UniqueConstraintError } from "sequelize"; -import tmp from "tmp"; +import { DocumentValidation } from "@shared/validations"; import type { ImportTaskInput, ImportTaskOutput, @@ -19,12 +18,13 @@ import env from "@server/env"; import Logger from "@server/logging/Logger"; import type { ImportTask } from "@server/models"; import { Attachment } from "@server/models"; -import { Buckets } from "@server/models/helpers/AttachmentHelper"; +import AttachmentHelper, { + Buckets, +} from "@server/models/helpers/AttachmentHelper"; import { ProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper"; import { sequelize } from "@server/storage/database"; import FileStorage from "@server/storage/files"; -import type { FileTreeNode } from "@server/utils/ImportHelper"; -import ImportHelper from "@server/utils/ImportHelper"; +import type { ZipTreeNode } from "@server/utils/ZipHelper"; import ZipHelper from "@server/utils/ZipHelper"; import type { ProcessOutput } from "./APIImportTask"; import APIImportTask from "./APIImportTask"; @@ -32,11 +32,6 @@ import { DocumentConverter } from "@server/utils/DocumentConverter"; type Markdown = IntegrationService.Markdown; -interface ExtractedZip { - dirPath: string; - cleanup: () => Promise; -} - interface DiscoveredDocument { id: string; title: string; @@ -173,25 +168,31 @@ export default class MarkdownAPIImportTask extends APIImportTask { return; } - const { dirPath, cleanup } = await this.downloadAndExtract( - scratch.storageKey - ); + const handle = await FileStorage.getFileHandle(scratch.storageKey); try { const createdBy = lastImportTask.import.createdBy; + const manifestByPath = new Map( + scratch.manifest.map((item) => [item.pathInZip, item]) + ); + const maxAttachmentSize = AttachmentHelper.presetToMaxUploadSize( + AttachmentPreset.DocumentAttachment + ); + const seen = new Set(); - for (const item of scratch.manifest) { - const filePath = path.join(dirPath, item.pathInZip); - let buffer: Buffer; - try { - buffer = await fs.readFile(filePath); - } catch (err) { - Logger.warn( - `Markdown import attachment missing in zip, skipping: ${item.pathInZip}`, - err instanceof Error ? err : undefined - ); - continue; + await ZipHelper.walk(handle.path, async (entry) => { + if (entry.isDirectory) { + return; } + // Normalize to match the bootstrap-phase pathInZip (segments rejoined + // with `/`, no leading `./` or empty segments). + const normalized = entry.fileName.split("/").filter(Boolean).join("/"); + const item = manifestByPath.get(normalized); + if (!item) { + return; + } + seen.add(item.pathInZip); + const buffer = await entry.readBuffer(maxAttachmentSize); try { await sequelize.transaction(async (transaction) => @@ -214,13 +215,21 @@ export default class MarkdownAPIImportTask extends APIImportTask { // this hook can re-encounter ids that already landed. Treat the // unique-id collision as a no-op so the import remains resumable. if (err instanceof UniqueConstraintError) { - continue; + return; } throw err; } + }); + + for (const item of scratch.manifest) { + if (!seen.has(item.pathInZip)) { + Logger.warn( + `Markdown import attachment missing in zip, skipping: ${item.pathInZip}` + ); + } } } finally { - await cleanup(); + await handle.cleanup().catch(() => {}); } } @@ -232,11 +241,28 @@ export default class MarkdownAPIImportTask extends APIImportTask { throw new Error("Markdown import is missing scratch.storageKey"); } - const { dirPath, cleanup } = await this.downloadAndExtract(storageKey); + const handle = await FileStorage.getFileHandle(storageKey); try { - const tree = await ImportHelper.toFileTree(dirPath); - if (!tree) { + // Side map of pre-loaded markdown text keyed by tree node identity. + // ZipHelper's tree carries only metadata; we capture text during the + // single walk so the downstream pass doesn't have to re-open the zip. + const markdownByNode = new Map(); + + const tree = await ZipHelper.toFileTree( + handle.path, + async (node, entry) => { + const ext = path.extname(node.name).toLowerCase(); + if (ext === ".md" || ext === ".markdown") { + const buffer = await entry.readBuffer( + DocumentValidation.maxStateLength + ); + markdownByNode.set(node, buffer.toString("utf8")); + } + } + ); + + if (tree.children.length === 0) { throw new Error("Could not find valid content in zip file"); } @@ -245,14 +271,14 @@ export default class MarkdownAPIImportTask extends APIImportTask { for (const node of tree.children) { if (node.children.length === 0) { - Logger.debug("task", `Unhandled file in zip: ${node.path}`, { + Logger.debug("task", `Unhandled file in zip: ${node.pathInZip}`, { importTaskId: importTask.id, }); continue; } if (this.isAttachmentFolder(node)) { - this.collectAttachments(node, manifest, dirPath); + this.collectAttachments(node, manifest); continue; } @@ -263,12 +289,12 @@ export default class MarkdownAPIImportTask extends APIImportTask { }; collections.push(collection); - await this.collectDocumentsAndAttachments({ + this.collectDocumentsAndAttachments({ children: node.children, collectionId: collection.id, out: collection.children, manifest, - extractionRoot: dirPath, + markdownByNode, }); } @@ -336,7 +362,7 @@ export default class MarkdownAPIImportTask extends APIImportTask { return { taskOutput: collectionOutputs, childTasksInput }; } finally { - await cleanup(); + await handle.cleanup().catch(() => {}); } } @@ -477,10 +503,10 @@ export default class MarkdownAPIImportTask extends APIImportTask { * Detects folders containing only attachments (no markdown documents). * Recursively considers nested folders; mirrors the legacy heuristic. * - * @param node FileTreeNode to inspect. + * @param node ZipTreeNode to inspect. * @returns true when the folder appears to hold only attachments. */ - private isAttachmentFolder(node: FileTreeNode): boolean { + private isAttachmentFolder(node: ZipTreeNode): boolean { if (node.children.length === 0) { return false; } @@ -501,29 +527,26 @@ export default class MarkdownAPIImportTask extends APIImportTask { /** * Recursively collects all files under an attachment-only folder into the - * manifest. `pathInZip` is stored as a path relative to the extraction - * root so it can be resolved again after the zip is re-extracted during - * the completion phase (which lands in a fresh tmp dir). + * manifest. `pathInZip` is stored verbatim so the completion phase can find + * the same entry when re-walking the archive. * - * @param node Attachment-folder FileTreeNode. + * @param node Attachment-folder ZipTreeNode. * @param manifest Manifest array to push entries into. - * @param extractionRoot Absolute path to the zip extraction root. */ private collectAttachments( - node: FileTreeNode, - manifest: MarkdownAttachmentManifestItem[], - extractionRoot: string + node: ZipTreeNode, + manifest: MarkdownAttachmentManifestItem[] ): void { for (const child of node.children) { if (child.children.length > 0) { - this.collectAttachments(child, manifest, extractionRoot); + this.collectAttachments(child, manifest); continue; } manifest.push({ id: randomUUID(), name: child.name, - pathInZip: path.relative(extractionRoot, child.path), - mimeType: mime.lookup(child.path) || "application/octet-stream", + pathInZip: child.pathInZip, + mimeType: mime.lookup(child.name) || "application/octet-stream", }); } } @@ -534,31 +557,31 @@ export default class MarkdownAPIImportTask extends APIImportTask { * entry's `children` holds its direct descendants. This is the shape the * per-page task cascade consumes. * - * @param children FileTreeNode children of the current folder. + * @param children ZipTreeNode children of the current folder. * @param collectionId Pre-assigned id of the enclosing collection. * @param parentDocumentId Optional parent document id when nested. * @param out Sibling accumulator to push discovered documents into. * @param manifest Attachment manifest accumulator. - * @returns Promise that resolves when the subtree has been processed. + * @param markdownByNode Pre-loaded markdown text keyed by tree node. */ - private async collectDocumentsAndAttachments({ + private collectDocumentsAndAttachments({ children, collectionId, parentDocumentId, out, manifest, - extractionRoot, + markdownByNode, }: { - children: FileTreeNode[]; + children: ZipTreeNode[]; collectionId: string; parentDocumentId?: string; out: DiscoveredDocument[]; manifest: MarkdownAttachmentManifestItem[]; - extractionRoot: string; - }): Promise { + markdownByNode: Map; + }): void { for (const child of children) { if (child.children.length > 0 && this.isAttachmentFolder(child)) { - this.collectAttachments(child, manifest, extractionRoot); + this.collectAttachments(child, manifest); continue; } @@ -570,16 +593,14 @@ export default class MarkdownAPIImportTask extends APIImportTask { manifest.push({ id: randomUUID(), name: child.name, - pathInZip: path.relative(extractionRoot, child.path), - mimeType: mime.lookup(child.path) || "application/octet-stream", + pathInZip: child.pathInZip, + mimeType: mime.lookup(child.name) || "application/octet-stream", }); continue; } const id = randomUUID(); - const markdownText = isFolder - ? "" - : await fs.readFile(child.path, "utf8"); + const markdownText = isFolder ? "" : (markdownByNode.get(child) ?? ""); // Folder-and-file with the same title (a "name.md" alongside a "name/" // directory) is merged onto a single document: the folder body picks up @@ -591,13 +612,13 @@ export default class MarkdownAPIImportTask extends APIImportTask { sibling.markdownText = markdownText; } if (isFolder) { - await this.collectDocumentsAndAttachments({ + this.collectDocumentsAndAttachments({ children: child.children, collectionId, parentDocumentId: sibling.id, out: sibling.children, manifest, - extractionRoot, + markdownByNode, }); } continue; @@ -606,7 +627,7 @@ export default class MarkdownAPIImportTask extends APIImportTask { const node: DiscoveredDocument = { id, title: child.title, - pathInZip: path.relative(extractionRoot, child.path), + pathInZip: child.pathInZip, collectionId, parentDocumentId, markdownText, @@ -615,58 +636,15 @@ export default class MarkdownAPIImportTask extends APIImportTask { out.push(node); if (isFolder) { - await this.collectDocumentsAndAttachments({ + this.collectDocumentsAndAttachments({ children: child.children, collectionId, parentDocumentId: id, out: node.children, manifest, - extractionRoot, + markdownByNode, }); } } } - - /** - * Downloads the zip from object storage and extracts it into a temporary - * directory. - * - * @param storageKey Storage key for the uploaded zip. - * @returns The temp dir path and a cleanup callback. Caller must invoke - * cleanup() once finished. - */ - private async downloadAndExtract(storageKey: string): Promise { - const handle = await FileStorage.getFileHandle(storageKey); - - let dirPath: string | undefined; - try { - dirPath = await new Promise((resolve, reject) => { - tmp.dir({ unsafeCleanup: true }, (err, tmpDir) => { - if (err) { - reject(err); - return; - } - resolve(tmpDir); - }); - }); - - await ZipHelper.extract(handle.path, dirPath); - - return { - dirPath, - cleanup: async () => { - await fs - .rm(dirPath!, { recursive: true, force: true }) - .catch(() => {}); - await handle.cleanup().catch(() => {}); - }, - }; - } catch (err) { - if (dirPath) { - await fs.rm(dirPath, { recursive: true, force: true }).catch(() => {}); - } - await handle.cleanup().catch(() => {}); - throw err; - } - } } diff --git a/server/utils/ZipHelper.test.ts b/server/utils/ZipHelper.test.ts index 0e0cacf1ee..c2f6bdcdf1 100644 --- a/server/utils/ZipHelper.test.ts +++ b/server/utils/ZipHelper.test.ts @@ -66,3 +66,92 @@ describe("ZipHelper.extract", () => { expect(await fs.pathExists(path.join(scratchCwd, filename))).toBe(false); }); }); + +describe("ZipHelper.toFileTree", () => { + it("builds a nested tree with normalized pathInZip", async () => { + const zipPath = await writeZip({ + "Collection/sub/page.md": "# hi", + "Collection/other.md": "other", + }); + + const root = await ZipHelper.toFileTree(zipPath); + expect(root.children).toHaveLength(1); + + const collection = root.children[0]; + expect(collection.name).toBe("Collection"); + expect(collection.pathInZip).toBe("Collection"); + expect(collection.children.map((c) => c.name).sort()).toEqual([ + "other.md", + "sub", + ]); + + const sub = collection.children.find((c) => c.name === "sub"); + expect(sub?.children[0].pathInZip).toBe("Collection/sub/page.md"); + }); + + it("normalizes `./`-prefixed entries instead of dropping them", async () => { + const zipPath = await writeZip({ + "./Collection/page.md": "body", + }); + + const root = await ZipHelper.toFileTree(zipPath); + expect(root.children).toHaveLength(1); + expect(root.children[0].name).toBe("Collection"); + expect(root.children[0].children[0].pathInZip).toBe("Collection/page.md"); + }); + + it("filters macOS metadata and dotfiles at any depth", async () => { + const zipPath = await writeZip({ + "__MACOSX/Collection/page.md": "junk", + "Collection/.DS_Store": "junk", + "Collection/page.md": "body", + }); + + const root = await ZipHelper.toFileTree(zipPath); + expect(root.children).toHaveLength(1); + expect(root.children[0].name).toBe("Collection"); + expect(root.children[0].children.map((c) => c.name)).toEqual(["page.md"]); + }); + + it("invokes onFile for each file entry with a readable handle", async () => { + const zipPath = await writeZip({ + "Collection/page.md": "hello world", + "Collection/image.png": "binary", + }); + + const seen: Record = {}; + await ZipHelper.toFileTree(zipPath, async (node, entry) => { + if (node.name.endsWith(".md")) { + const buf = await entry.readBuffer(100); + seen[node.pathInZip] = buf.toString("utf8"); + } + }); + + expect(seen).toEqual({ "Collection/page.md": "hello world" }); + }); + + it("rejects reads larger than the provided max size", async () => { + const zipPath = await writeZip({ + "Collection/page.md": "hello world", + }); + + await expect( + ZipHelper.toFileTree(zipPath, async (_node, entry) => { + await entry.readBuffer(10); + }) + ).rejects.toThrow("Collection/page.md is too large"); + }); + + it("exposes entry sizes before the entry is read", async () => { + const zipPath = await writeZip({ + "Collection/page.md": "hello world", + }); + + const sizes: Record = {}; + await ZipHelper.toFileTree(zipPath, (node, entry) => { + sizes[node.pathInZip] = entry.uncompressedSize; + }); + + expect(sizes).toEqual({ "Collection/page.md": 11 }); + }); +}); diff --git a/server/utils/ZipHelper.ts b/server/utils/ZipHelper.ts index 05a0dc46fe..8f6c373840 100644 --- a/server/utils/ZipHelper.ts +++ b/server/utils/ZipHelper.ts @@ -5,12 +5,40 @@ import tmp from "tmp"; import type { Entry } from "yauzl"; import yauzl, { validateFileName } from "yauzl"; import { bytesToHumanReadable } from "@shared/utils/files"; +import { ValidationError } from "@server/errors"; import Logger from "@server/logging/Logger"; import { trace } from "@server/logging/tracing"; -import { trimFilenameAndExt } from "./fs"; +import { deserializeFilename, trimFilenameAndExt } from "./fs"; const MAX_FILE_NAME_LENGTH = 255; +export interface ZipEntryHandle { + /** UTF-8 filename as recorded in the zip; directory entries end with `/`. */ + fileName: string; + /** Size of the uncompressed entry in bytes. */ + uncompressedSize: number; + /** True when this entry is a directory marker rather than a file. */ + isDirectory: boolean; + /** + * Read the entry's contents into memory. Safe to skip — entries the caller + * does not read are simply advanced past. + * + * @param maxSize Maximum uncompressed size to read into memory, in bytes. + */ + readBuffer(maxSize: number): Promise; +} + +export interface ZipTreeNode { + /** The file name (last path segment) including extension. */ + name: string; + /** Title derived from the file name (extension stripped, deserialized). */ + title: string; + /** Path within the zip (no leading slash, segments joined with `/`). */ + pathInZip: string; + /** Nested children — populated for directory entries. */ + children: ZipTreeNode[]; +} + @trace() export default class ZipHelper { public static defaultStreamOptions: JSZip.JSZipGeneratorOptions<"nodebuffer"> = @@ -104,6 +132,189 @@ export default class ZipHelper { }); } + /** + * Iterate through entries in a zip file without extracting it to disk. + * Entries are visited serially in archive order. `onEntry` may be async; the + * next entry is only read once the previous handler resolves. + * + * @param filePath The file path where the zip is located. + * @param onEntry Handler invoked for each entry. Skip an entry by returning + * without calling `entry.readBuffer(maxSize)`. + * @returns Promise that resolves once the archive has been fully walked. + */ + public static walk( + filePath: string, + onEntry: (entry: ZipEntryHandle) => Promise | void + ): Promise { + return new Promise((resolve, reject) => { + yauzl.open( + filePath, + { + lazyEntries: true, + autoClose: true, + decodeStrings: false, + }, + function (err, zipfile) { + if (err) { + return reject(err); + } + + let settled = false; + const fail = (error: Error) => { + if (settled) { + return; + } + settled = true; + zipfile.close(); + reject(error); + }; + + zipfile.on("entry", (entry: Entry) => { + const fileName = Buffer.from(entry.fileName).toString("utf8"); + + if (validateFileName(fileName)) { + Logger.warn("Invalid zip entry", { fileName }); + zipfile.readEntry(); + return; + } + + const handle: ZipEntryHandle = { + fileName, + uncompressedSize: entry.uncompressedSize, + isDirectory: /\/$/.test(fileName), + readBuffer: (maxSize) => + new Promise((res, rej) => { + if (entry.uncompressedSize > maxSize) { + return rej(ZipHelper.entryTooLargeError(fileName, maxSize)); + } + + zipfile.openReadStream(entry, (rErr, readStream) => { + if (rErr) { + return rej(rErr); + } + const chunks: Buffer[] = []; + let bytesRead = 0; + readStream.on("data", (chunk: Buffer) => { + bytesRead += chunk.length; + if (bytesRead > maxSize) { + readStream.destroy( + ZipHelper.entryTooLargeError(fileName, maxSize) + ); + return; + } + chunks.push(chunk); + }); + readStream.on("end", () => res(Buffer.concat(chunks))); + readStream.on("error", rej); + }); + }), + }; + + Promise.resolve() + .then(() => onEntry(handle)) + .then(() => { + if (!settled) { + zipfile.readEntry(); + } + }) + .catch(fail); + }); + + zipfile.on("close", () => { + if (!settled) { + settled = true; + resolve(); + } + }); + zipfile.on("error", (error) => fail(error)); + zipfile.readEntry(); + } + ); + }); + } + + /** + * Walk a zip file once and build a tree of its entries without extracting + * to disk. macOS metadata directories (`__MACOSX`) and dotfiles are + * filtered out at any path segment. + * + * The optional `onFile` callback fires once per file entry as it is + * encountered, with both the materialized tree node and a handle to the + * raw entry. Callers that need to pre-load contents (e.g. small text + * files) can call `entry.readBuffer(maxSize)`; callers that only need the tree + * structure can omit the callback entirely. + * + * @param filePath Local filesystem path to the zip. + * @param onFile Optional per-file hook; not called for directory entries. + * @returns A synthetic root node whose `children` are the zip's top-level + * entries. + */ + public static async toFileTree( + filePath: string, + onFile?: (node: ZipTreeNode, entry: ZipEntryHandle) => Promise | void + ): Promise { + const root: ZipTreeNode = { + name: "", + title: "", + pathInZip: "", + children: [], + }; + + const isFiltered = (segment: string) => + segment === "__MACOSX" || segment.startsWith("."); + + const nodesByPath = new Map(); + + const resolveNode = (entryName: string): ZipTreeNode | null => { + // Drop empty segments and the path-no-op `.` (e.g. entries written as + // `./Collection/page.md`). `..` is preserved so the dotfile filter + // below rejects it — we never resolve path traversal in zip entries. + const segments = entryName + .split("/") + .filter((s) => s !== "" && s !== "."); + if (segments.length === 0) { + return null; + } + + let current = root; + let pathSoFar = ""; + for (let i = 0; i < segments.length; i++) { + const segment = segments[i]; + if (isFiltered(segment)) { + return null; + } + + pathSoFar = pathSoFar ? `${pathSoFar}/${segment}` : segment; + let next = nodesByPath.get(pathSoFar); + if (!next) { + next = { + name: segment, + title: deserializeFilename(path.parse(segment).name), + pathInZip: pathSoFar, + children: [], + }; + current.children.push(next); + nodesByPath.set(pathSoFar, next); + } + current = next; + } + + return current; + }; + + await this.walk(filePath, async (entry) => { + const node = resolveNode(entry.fileName); + if (!node || entry.isDirectory) { + return; + } + if (onFile) { + await onFile(node, entry); + } + }); + + return root; + } + /** * Write a zip file to a disk location * @@ -231,4 +442,12 @@ export default class ZipHelper { ); }); } + + private static entryTooLargeError(fileName: string, maxSize: number): Error { + return ValidationError( + `${fileName} is too large - the maximum size is ${bytesToHumanReadable( + maxSize + )}` + ); + } }