// outline/server/queues/tasks/ImportMarkdownZipTask.ts

import path from "node:path";
import fs from "fs-extra";
import { escapeRegExp } from "es-toolkit/compat";
import mime from "mime-types";
import { randomUUID } from "node:crypto";
import documentImporter from "@server/commands/documentImporter";
import { createContext } from "@server/context";
import Logger from "@server/logging/Logger";
import type { FileOperation } from "@server/models";
import { User } from "@server/models";
import { Buckets } from "@server/models/helpers/AttachmentHelper";
import { sequelize } from "@server/storage/database";
import type { FileTreeNode } from "@server/utils/ImportHelper";
import ImportHelper from "@server/utils/ImportHelper";
import type { StructuredImportData } from "./ImportTask";
import ImportTask from "./ImportTask";

export default class ImportMarkdownZipTask extends ImportTask {
  /**
   * Builds the structured import data for a directory of markdown files.
   *
   * @param dirPath The directory to build the file tree from
   * @param fileOperation The file operation the import belongs to
   * @returns A StructuredImportData object
   * @throws Error when no file tree could be built from the directory
   */
  public async parseData(
    dirPath: string,
    fileOperation: FileOperation
  ): Promise<StructuredImportData> {
    const tree = await ImportHelper.toFileTree(dirPath);
    if (!tree) {
      throw new Error("Could not find valid content in zip file");
    }

    return this.parseFileTree(fileOperation, tree.children);
  }

  /**
   * Decide whether a folder should be imported as attachments rather than
   * as documents.
   *
   * A non-empty folder titled "attachments" (case-insensitive) always
   * qualifies, regardless of its contents. Any other folder qualifies only
   * when every descendant is a file with an extension other than `.md` or
   * `.markdown`.
   *
   * @param node The file tree node to check
   * @returns true if the folder should be treated as an attachment folder
   */
  private isAttachmentFolder(node: FileTreeNode): boolean {
    // A node without children is either a file or an empty folder.
    if (node.children.length === 0) {
      return false;
    }

    if (node.title.toLowerCase() === "attachments") {
      return true;
    }

    return node.children.every((child) => {
      // If child has children, it's a folder - recurse to check its contents
      if (child.children.length > 0) {
        return this.isAttachmentFolder(child);
      }

      // Child has no children - could be a file or empty folder
      const ext = path.extname(child.name).toLowerCase();

      // If no extension, it's likely an empty folder, not a file.
      // Be conservative and don't treat it as an attachment.
      if (!ext) {
        return false;
      }

      // It's a file with an extension - check if it's NOT markdown
      return ext !== ".md" && ext !== ".markdown";
    });
  }

  /**
   * Recursively register every leaf node below a folder as an attachment.
   * File contents are not read here; each attachment gets a lazy `buffer`
   * callback that reads the file when invoked.
   *
   * @param node The file tree node to process
   * @param output The structured import data to add attachments to
   */
  private parseAttachmentFolder(
    node: FileTreeNode,
    output: StructuredImportData
  ): void {
    for (const child of node.children) {
      if (child.children.length > 0) {
        this.parseAttachmentFolder(child, output);
      } else {
        const id = randomUUID();
        output.attachments.push({
          id,
          name: child.name,
          path: child.path,
          mimeType: mime.lookup(child.path) || "application/octet-stream",
          buffer: () => fs.readFile(child.path),
        });
      }
    }
  }

  /**
   * Build the patterns under which an attachment may be referenced from
   * document text. The patterns depend only on the attachment path, so they
   * are built once per attachment and reused for every document.
   *
   * @param attachmentPath The on-disk path of the attachment
   * @returns Global regular expressions, in the order they must be applied
   */
  private buildAttachmentPatterns(attachmentPath: string): RegExp[] {
    const encodedPath = encodeURI(attachmentPath);

    // Pull the collection and subdirectory out of the path name, upload
    // folders in an export are relative to the document itself.
    // Support both legacy bucket names (uploads/public) and generic
    // attachment folders.
    const normalizedAttachmentPath = encodedPath
      .replace(new RegExp(`(.*)/${Buckets.uploads}/`), `${Buckets.uploads}/`)
      .replace(new RegExp(`(.*)/${Buckets.public}/`), `${Buckets.public}/`);

    // Also try normalizing to just the folder containing the attachment.
    // This handles arbitrary folder names like "attachments/"
    const attachmentDir = path.basename(path.dirname(attachmentPath));
    const attachmentFileName = path.basename(attachmentPath);
    const genericNormalizedPath = `${attachmentDir}/${encodeURI(attachmentFileName)}`;

    const patterns = [
      new RegExp(escapeRegExp(encodedPath), "g"),
      new RegExp(`\\.?/?${escapeRegExp(normalizedAttachmentPath)}`, "g"),
      new RegExp(`\\.?/?${escapeRegExp(genericNormalizedPath)}`, "g"),
    ];

    // Handle markdown links that reference attachments via a path rooted
    // at an "attachments" folder, optionally prefixed with "./", e.g.
    // ./attachments/foo.png or ./attachments/sub/foo.png.
    const segments = attachmentPath.split(path.sep);
    const attachmentsIdx = segments.findIndex(
      (seg) => seg.toLowerCase() === "attachments"
    );
    if (attachmentsIdx >= 0) {
      const relFromAttachments = segments.slice(attachmentsIdx).join("/");
      patterns.push(
        new RegExp(
          `\\.?/?${escapeRegExp(encodeURI(relFromAttachments))}`,
          "g"
        )
      );
    }

    return patterns;
  }

  /**
   * Replace the targets of markdown links to other imported documents with
   * `<<documentId>>` placeholders. Only the link target itself is rewritten;
   * link labels and surrounding prose are left untouched, as are links that
   * do not resolve to an imported document.
   *
   * @param text The document text
   * @param basePath The directory the document lives in, used to resolve
   * relative link targets
   * @param docPathToIdMap Map of imported file paths to document ids
   * @returns The text with resolvable link targets replaced
   */
  private replaceInternalLinks(
    text: string,
    basePath: string,
    docPathToIdMap: ReadonlyMap<string, string>
  ): string {
    return text.replace(
      /(\[[^\]]+\]\()([^)]+\.md)\)/g,
      (match: string, prefix: string, target: string) => {
        // Only the link target is URI-encoded; the base path is a raw file
        // system path and must not be decoded. A target with a malformed
        // escape sequence (e.g. a literal "%") is looked up verbatim rather
        // than failing the whole import.
        let decodedTarget = target;
        try {
          decodedTarget = decodeURI(target);
        } catch {
          // keep the raw target
        }

        const referredDocId = docPathToIdMap.get(
          path.normalize(`${basePath}/${decodedTarget}`)
        );
        return referredDocId ? `${prefix}<<${referredDocId}>>)` : match;
      }
    );
  }

  /**
   * Converts the file tree produced by ImportHelper.toFileTree into
   * documents, collections, and attachments.
   *
   * Root-level folders become collections (or attachments, when they are
   * attachment folders); root-level files are logged and skipped. Nested
   * nodes become documents, with attachment folders split out along the way.
   * Finally, attachment references and links between imported documents in
   * the document text are replaced with `<<id>>` placeholders.
   *
   * @param fileOperation The file operation
   * @param tree An array of FileTreeNode representing root files in the zip
   * @returns A StructuredImportData object
   */
  private async parseFileTree(
    fileOperation: FileOperation,
    tree: FileTreeNode[]
  ): Promise<StructuredImportData> {
    const user = await User.findByPk(fileOperation.userId, {
      rejectOnEmpty: true,
    });
    const output: StructuredImportData = {
      collections: [],
      documents: [],
      attachments: [],
    };

    // Maps the path of every imported file/folder to the id of the document
    // it ended up in, used to resolve links between documents.
    const docPathToIdMap = new Map<string, string>();

    const parseNodeChildren = async (
      children: FileTreeNode[],
      collectionId: string,
      parentDocumentId?: string
    ): Promise<void> => {
      for (const child of children) {
        // special case for folders of attachments - detect by content
        if (child.children.length > 0 && this.isAttachmentFolder(child)) {
          this.parseAttachmentFolder(child, output);
          continue;
        }

        // Folders have no content of their own. Read files before opening
        // the transaction so it is not held open during disk I/O.
        const content =
          child.children.length > 0
            ? ""
            : await fs.readFile(child.path, "utf8");

        const { title, icon, text } = await sequelize.transaction(
          async (transaction) =>
            documentImporter({
              mimeType: "text/markdown",
              fileName: child.name,
              content,
              user,
              ctx: createContext({ user, transaction }),
            })
        );

        const existingDocument = output.documents.find(
          (doc) =>
            doc.title === title &&
            doc.collectionId === collectionId &&
            doc.parentDocumentId === parentDocumentId
        );

        let documentId: string;

        // When there is a file and a folder with the same name this handles
        // the case by combining the two into one document with nested children
        if (existingDocument) {
          documentId = existingDocument.id;

          if (existingDocument.text === "") {
            existingDocument.text = text;
          }
        } else {
          documentId = randomUUID();

          output.documents.push({
            id: documentId,
            title,
            icon,
            text,
            collectionId,
            parentDocumentId,
            path: child.path,
            mimeType: "text/markdown",
          });
        }

        docPathToIdMap.set(child.path, documentId);
        await parseNodeChildren(child.children, collectionId, documentId);
      }
    };

    // All nodes in the root level should be collections
    for (const node of tree) {
      if (node.children.length > 0) {
        // Check if this is an attachments-only folder at root level
        if (this.isAttachmentFolder(node)) {
          this.parseAttachmentFolder(node, output);
          continue;
        }

        const collectionId = randomUUID();
        output.collections.push({
          id: collectionId,
          name: node.title,
        });
        await parseNodeChildren(node.children, collectionId);
      } else {
        Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
          fileOperationId: fileOperation.id,
        });
      }
    }

    // The patterns only depend on the attachment, so build them once rather
    // than once per document.
    const attachmentReplacements = output.attachments.map((attachment) => ({
      reference: `<<${attachment.id}>>`,
      patterns: this.buildAttachmentPatterns(attachment.path),
    }));

    for (const document of output.documents) {
      // Check all of the attachments we've created against urls in the text
      // and replace them out with attachment redirect urls before continuing.
      for (const { reference, patterns } of attachmentReplacements) {
        for (const pattern of patterns) {
          document.text = document.text.replace(pattern, reference);
        }
      }

      // check internal document links in the text and replace them with placeholders.
      // When persisting, the placeholders will be replaced with the right urls.
      document.text = this.replaceInternalLinks(
        document.text,
        path.dirname(document.path),
        docPathToIdMap
      );
    }

    return output;
  }
}