chore: Refactor Markdown importer to use new import pipeline (#12361)

* chore: Refactor Markdown importer to use new import pipeline

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Tom Moor
2026-05-16 14:10:15 -04:00
committed by GitHub
parent 170f59c6ba
commit 82d7041b6b
20 changed files with 1224 additions and 508 deletions
@@ -6,7 +6,12 @@ import { useTranslation } from "react-i18next";
import { toast } from "sonner";
import styled from "styled-components";
import { s } from "@shared/styles";
import { AttachmentPreset, CollectionPermission } from "@shared/types";
import {
AttachmentPreset,
CollectionPermission,
FileOperationFormat,
IntegrationService,
} from "@shared/types";
import { bytesToHumanReadable } from "@shared/utils/files";
import Button from "~/components/Button";
import Flex from "~/components/Flex";
@@ -27,7 +32,7 @@ type Props = {
function DropToImport({ disabled, onSubmit, children, format }: Props) {
const { t } = useTranslation();
const { collections } = useStores();
const { collections, imports } = useStores();
const [file, setFile] = useState<File | null>(null);
const [isImporting, setImporting] = useState(false);
const [permission, setPermission] = useState<CollectionPermission | null>(
@@ -53,7 +58,19 @@ function DropToImport({ disabled, onSubmit, children, format }: Props) {
name: file.name,
preset: AttachmentPreset.WorkspaceImport,
});
await collections.import(attachment.id, { format, permission });
if (format === FileOperationFormat.MarkdownZip) {
await imports.create(
{ service: IntegrationService.Markdown },
{
attachmentId: attachment.id,
permission: permission ?? undefined,
}
);
} else {
await collections.import(attachment.id, { format, permission });
}
onSubmit();
toast.message(file.name, {
description: t(
@@ -30,6 +30,10 @@ export class NotionImportsProcessor extends ImportsProcessor<IntegrationService.
importModel: Import<IntegrationService.Notion>,
transaction: Transaction
): Promise<NotionImportTaskInput> {
if (!importModel.integrationId) {
throw new Error("Notion import is missing integrationId");
}
const integration = await Integration.scope("withAuthentication").findByPk(
importModel.integrationId,
{ rejectOnEmpty: true }
@@ -29,15 +29,19 @@ export default class NotionAPIImportTask extends APIImportTask<IntegrationServic
];
/**
* Process the Notion import task.
* Process a Notion page-phase import task.
* This fetches data from Notion and converts it to task output.
*
* @param importTask ImportTask model to process.
* @returns Promise with output that resolves once processing has completed.
*/
protected async process(
protected async processPage(
importTask: ImportTask<IntegrationService.Notion>
): Promise<ProcessOutput<IntegrationService.Notion>> {
if (!importTask.import.integrationId) {
throw new Error("Notion import is missing integrationId");
}
const integration = await Integration.scope("withAuthentication").findByPk(
importTask.import.integrationId,
{ rejectOnEmpty: true }
@@ -47,7 +51,7 @@ export default class NotionAPIImportTask extends APIImportTask<IntegrationServic
const parsedPages: (ParsePageOutput | null)[] = [];
for (const item of importTask.input) {
parsedPages.push(await this.processPage({ item, client }));
parsedPages.push(await this.parsePage({ item, client }));
}
// Filter out any null results (from pages/databases that couldn't be accessed)
@@ -56,7 +60,7 @@ export default class NotionAPIImportTask extends APIImportTask<IntegrationServic
const taskOutput: ImportTaskOutput = validParsedPages.map((parsedPage) => ({
externalId: parsedPage.externalId,
title: parsedPage.title,
emoji: parsedPage.emoji,
icon: parsedPage.icon,
content: parsedPage.content,
author: parsedPage.author,
createdAt: parsedPage.createdAt,
@@ -96,7 +100,7 @@ export default class NotionAPIImportTask extends APIImportTask<IntegrationServic
* @param client Notion client.
* @returns Promise of parsed page output that resolves when the task is scheduled.
*/
private async processPage({
private async parsePage({
item,
client,
}: {
@@ -112,13 +116,14 @@ export default class NotionAPIImportTask extends APIImportTask<IntegrationServic
try {
// Convert Notion database to an empty page with "pages in database" as its children.
if (item.type === PageType.Database) {
const { pages, ...databaseInfo } = await client.fetchDatabase(
const { pages, emoji, ...databaseInfo } = await client.fetchDatabase(
item.externalId,
{ titleMaxLength }
);
return {
...databaseInfo,
icon: emoji,
externalId: item.externalId,
content: ProsemirrorHelper.getEmptyDocument() as ProsemirrorDoc,
collectionExternalId,
@@ -129,12 +134,14 @@ export default class NotionAPIImportTask extends APIImportTask<IntegrationServic
};
}
const { blocks, ...pageInfo } = await client.fetchPage(item.externalId, {
titleMaxLength,
});
const { blocks, emoji, ...pageInfo } = await client.fetchPage(
item.externalId,
{ titleMaxLength }
);
return {
...pageInfo,
icon: emoji,
externalId: item.externalId,
content: NotionConverter.page({ children: blocks } as NotionPage),
collectionExternalId,
@@ -0,0 +1,16 @@
"use strict";
/** @type {import('sequelize-cli').Migration} */
module.exports = {
async up(queryInterface) {
await queryInterface.sequelize.query(
`ALTER TABLE "imports" ALTER COLUMN "integrationId" DROP NOT NULL`
);
},
async down(queryInterface) {
await queryInterface.sequelize.query(
`ALTER TABLE "imports" ALTER COLUMN "integrationId" SET NOT NULL`
);
},
};
@@ -0,0 +1,21 @@
"use strict";
/** @type {import('sequelize-cli').Migration} */
module.exports = {
async up(queryInterface, Sequelize) {
await queryInterface.addColumn("import_tasks", "phase", {
type: Sequelize.STRING,
allowNull: false,
defaultValue: "page",
});
await queryInterface.addColumn("imports", "scratch", {
type: Sequelize.JSONB,
allowNull: true,
});
},
async down(queryInterface) {
await queryInterface.removeColumn("imports", "scratch");
await queryInterface.removeColumn("import_tasks", "phase");
},
};
+9 -3
View File
@@ -1,5 +1,6 @@
import type { InferAttributes, InferCreationAttributes } from "sequelize";
import {
AllowNull,
BelongsTo,
Column,
DataType,
@@ -10,7 +11,7 @@ import {
IsNumeric,
Table,
} from "sequelize-typescript";
import { type ImportInput } from "@shared/schema";
import { type ImportInput, type ImportScratch } from "@shared/schema";
import { ImportableIntegrationService, ImportState } from "@shared/types";
import { ImportValidation } from "@shared/validations";
import Integration from "./Integration";
@@ -55,6 +56,10 @@ class Import<T extends ImportableIntegrationService> extends ParanoidModel<
@Column(DataType.JSONB)
input: ImportInput<T>;
@AllowNull
@Column(DataType.JSONB)
scratch: ImportScratch<T> | null;
@IsNumeric
@Default(0)
@Column(DataType.INTEGER)
@@ -66,11 +71,12 @@ class Import<T extends ImportableIntegrationService> extends ParanoidModel<
// associations
@BelongsTo(() => Integration, "integrationId")
integration: Integration;
integration: Integration | null;
@AllowNull
@ForeignKey(() => Integration)
@Column(DataType.UUID)
integrationId: string;
integrationId: string | null;
@BelongsTo(() => User, "createdById")
createdBy: User;
+5 -1
View File
@@ -11,7 +11,7 @@ import {
import type { ImportTaskOutput } from "@shared/schema";
import { type ImportTaskInput } from "@shared/schema";
import type { ImportableIntegrationService } from "@shared/types";
import { ImportTaskState } from "@shared/types";
import { ImportTaskPhase, ImportTaskState } from "@shared/types";
import Import from "./Import";
import IdModel from "./base/IdModel";
import Fix from "./decorators/Fix";
@@ -40,6 +40,10 @@ class ImportTask<T extends ImportableIntegrationService> extends IdModel<
@Column(DataType.STRING)
state: ImportTaskState;
@IsIn([Object.values(ImportTaskPhase)])
@Column(DataType.STRING)
phase: ImportTaskPhase;
@Column(DataType.JSONB)
input: ImportTaskInput<T>;
@@ -5,7 +5,6 @@ import ExportHTMLZipTask from "../tasks/ExportHTMLZipTask";
import ExportJSONTask from "../tasks/ExportJSONTask";
import ExportMarkdownZipTask from "../tasks/ExportMarkdownZipTask";
import ImportJSONTask from "../tasks/ImportJSONTask";
import ImportMarkdownZipTask from "../tasks/ImportMarkdownZipTask";
import BaseProcessor from "./BaseProcessor";
export default class FileOperationCreatedProcessor extends BaseProcessor {
@@ -19,14 +18,11 @@ export default class FileOperationCreatedProcessor extends BaseProcessor {
}
);
// map file operation type and format to the appropriate task
// map file operation type and format to the appropriate task. Markdown
// zip imports flow through the API-import pipeline (`imports.create` →
// MarkdownAPIImportTask) and never reach this dispatcher.
if (fileOperation.type === FileOperationType.Import) {
switch (fileOperation.format) {
case FileOperationFormat.MarkdownZip:
await new ImportMarkdownZipTask().schedule({
fileOperationId: event.modelId,
});
break;
case FileOperationFormat.JSON:
await new ImportJSONTask().schedule({
fileOperationId: event.modelId,
+50 -20
View File
@@ -5,14 +5,24 @@ import type { CreateOptions, CreationAttributes, Transaction } from "sequelize";
import { UniqueConstraintError } from "sequelize";
import { randomUUID } from "node:crypto";
import { randomElement } from "@shared/random";
import type { ImportInput, ImportTaskInput } from "@shared/schema";
import type {
BaseImportInput,
BaseImportTaskInput,
ImportInput,
ImportTaskInput,
} from "@shared/schema";
import type {
ImportableIntegrationService,
ProsemirrorData,
ProsemirrorDoc,
SourceMetadata,
} from "@shared/types";
import { ImportState, ImportTaskState, MentionType } from "@shared/types";
import {
ImportState,
ImportTaskPhase,
ImportTaskState,
MentionType,
} from "@shared/types";
import { colorPalette } from "@shared/utils/collections";
import { CollectionValidation } from "@shared/validations";
import { createContext } from "@server/context";
@@ -118,22 +128,26 @@ export default abstract class ImportsProcessor<
}
const tasksInput = await this.buildTasksInput(importModel, transaction);
const phase = this.getInitialPhase();
const importTasks = await Promise.all(
chunk(tasksInput, PagePerImportTask).map((input) => {
const attrs = {
state: ImportTaskState.Created,
input,
importId: importModel.id,
} as ImportTaskCreationAttributes<T>;
chunk(tasksInput as BaseImportTaskInput, PagePerImportTask).map(
(input) => {
const attrs = {
state: ImportTaskState.Created,
phase,
input,
importId: importModel.id,
} as ImportTaskCreationAttributes<T>;
return ImportTask.create<
ImportTask<T>,
CreateOptions<ImportTaskAttributes<T>>
>(attrs as unknown as CreationAttributes<ImportTask<T>>, {
transaction,
});
})
return ImportTask.create<
ImportTask<T>,
CreateOptions<ImportTaskAttributes<T>>
>(attrs as unknown as CreationAttributes<ImportTask<T>>, {
transaction,
});
}
)
);
importModel.state = ImportState.InProgress;
@@ -271,8 +285,12 @@ export default abstract class ImportsProcessor<
const createdCollections: Collection[] = [];
// External id to internal model id.
const idMap: Record<string, string> = {};
// These will be imported as collections.
const importInput = keyBy(importModel.input, "externalId");
// These will be imported as collections. Widened to the base input shape
// because the abstract class has no narrowed view of T.
const importInput = keyBy(
importModel.input as BaseImportInput,
"externalId"
);
const ctx = createContext({ user: importModel.createdBy, transaction });
const firstCollection = await Collection.findFirstCollectionForUser(
@@ -361,8 +379,8 @@ export default abstract class ImportsProcessor<
const collection = Collection.build({
id: internalId,
name: output.title,
icon: output.emoji ?? "collection",
color: output.emoji ? undefined : randomElement(colorPalette),
icon: output.icon ?? "collection",
color: output.icon ? undefined : randomElement(colorPalette),
content: transformedContent,
description: truncate(description, {
length: CollectionValidation.maxDescriptionLength,
@@ -403,7 +421,7 @@ export default abstract class ImportsProcessor<
const defaults = {
title: output.title,
icon: output.emoji,
icon: output.icon,
content: transformedContent,
text: await DocumentHelper.toMarkdown(transformedContent, {
includeTitle: false,
@@ -602,6 +620,18 @@ export default abstract class ImportsProcessor<
*/
protected abstract canProcess(importModel: Import<T>): boolean;
/**
* Phase assigned to the initial ImportTask rows created from
* `buildTasksInput`. The returned value is written to the `phase` attribute
* of every ImportTask created for the chunked task input when the import is
* first processed. Sources that begin with a bootstrap step (e.g.
* Markdown zip extraction) override this to return `Bootstrap`. Sources
* that fan out directly into page work (e.g. Notion) leave the default.
*
* @returns Phase for the first wave of ImportTask rows; `Page` unless
* overridden by a subclass.
*/
protected getInitialPhase(): ImportTaskPhase {
return ImportTaskPhase.Page;
}
/**
* Build task inputs which will be used for `APIImportTask`s.
*
@@ -0,0 +1,37 @@
import type { Transaction } from "sequelize";
import type { ImportTaskInput } from "@shared/schema";
import { ImportTaskPhase, IntegrationService } from "@shared/types";
import type { Import, ImportTask } from "@server/models";
import MarkdownAPIImportTask from "../tasks/MarkdownAPIImportTask";
import ImportsProcessor from "./ImportsProcessor";
/**
 * Imports processor for the Markdown service. The first wave of import tasks
 * it creates is tagged with the bootstrap phase and consists of a single task
 * input derived from the first input item of the import.
 */
export default class MarkdownImportsProcessor extends ImportsProcessor<IntegrationService.Markdown> {
  /**
   * Whether this processor handles the given import.
   *
   * @param importModel Import model to check.
   * @returns true when the import's service is Markdown.
   */
  protected canProcess(
    importModel: Import<IntegrationService.Markdown>
  ): boolean {
    return importModel.service === IntegrationService.Markdown;
  }

  /**
   * Markdown imports start with a bootstrap task rather than page tasks.
   *
   * @returns The bootstrap phase.
   */
  protected getInitialPhase(): ImportTaskPhase {
    return ImportTaskPhase.Bootstrap;
  }

  /**
   * Builds the input for the initial (bootstrap) import task.
   *
   * @param importModel Import model the task input is built for.
   * @param _transaction Unused; present to satisfy the base class signature.
   * @returns A single-item task input carrying the external id of the first
   * input item.
   * @throws Error when `scratch.storageKey` is not set on the import, or when
   * the import has no input item to take the external id from.
   */
  protected async buildTasksInput(
    importModel: Import<IntegrationService.Markdown>,
    _transaction: Transaction
  ): Promise<ImportTaskInput<IntegrationService.Markdown>> {
    if (!importModel.scratch?.storageKey) {
      throw new Error(
        "Markdown import is missing scratch.storageKey for the bootstrap phase"
      );
    }

    // Guard the index access so an empty input surfaces as a descriptive
    // error instead of "Cannot read properties of undefined".
    const rootInput = importModel.input[0];
    if (!rootInput) {
      throw new Error("Markdown import is missing its root input item");
    }

    return [{ externalId: rootInput.externalId }];
  }

  /**
   * Schedules a `MarkdownAPIImportTask` for the given import task.
   *
   * @param importTask Import task to schedule.
   * @returns Promise that resolves once the task has been scheduled.
   */
  protected async scheduleTask(
    importTask: ImportTask<IntegrationService.Markdown>
  ): Promise<void> {
    await new MarkdownAPIImportTask().schedule({ importTaskId: importTask.id });
  }
}
+100 -27
View File
@@ -4,13 +4,22 @@ import { Fragment, Node } from "prosemirror-model";
import type { WhereOptions } from "sequelize";
import { Transaction } from "sequelize";
import { randomUUID } from "node:crypto";
import type { ImportTaskInput, ImportTaskOutput } from "@shared/schema";
import type {
BaseImportTaskInput,
ImportTaskInput,
ImportTaskOutput,
} from "@shared/schema";
import type {
ImportableIntegrationService,
ProsemirrorData,
ProsemirrorDoc,
} from "@shared/types";
import { AttachmentPreset, ImportState, ImportTaskState } from "@shared/types";
import {
AttachmentPreset,
ImportState,
ImportTaskPhase,
ImportTaskState,
} from "@shared/types";
import { createContext } from "@server/context";
import { schema } from "@server/editor";
import Logger from "@server/logging/Logger";
@@ -134,31 +143,39 @@ export default abstract class APIImportTask<
* @returns Promise that resolves once processing has completed.
*/
private async onProcess(importTask: ImportTask<T>) {
const { taskOutput, childTasksInput } = await this.process(importTask);
const { taskOutput, childTasksInput } =
importTask.phase === ImportTaskPhase.Bootstrap
? await this.processBootstrap(importTask)
: await this.processPage(importTask);
const taskOutputWithReplacements = await Promise.all(
taskOutput.map(async (item) => ({
...item,
content: await this.uploadAttachments({
doc: item.content,
externalId: item.externalId,
createdBy: importTask.import.createdBy,
}),
}))
);
const taskOutputWithReplacements = this.shouldUploadAttachmentsPerPage()
? await Promise.all(
taskOutput.map(async (item) => ({
...item,
content: await this.uploadAttachments({
doc: item.content,
externalId: item.externalId,
createdBy: importTask.import.createdBy,
}),
}))
)
: taskOutput;
await sequelize.transaction(async (transaction) => {
await Promise.all(
chunk(childTasksInput, PagePerImportTask).map(async (input) => {
await ImportTask.create(
{
state: ImportTaskState.Created,
input,
importId: importTask.importId,
},
{ transaction }
);
})
chunk(childTasksInput as BaseImportTaskInput, PagePerImportTask).map(
async (input) => {
await ImportTask.create(
{
state: ImportTaskState.Created,
phase: ImportTaskPhase.Page,
input: input as ImportTaskInput<T>,
importId: importTask.importId,
},
{ transaction }
);
}
)
);
importTask.output = taskOutputWithReplacements;
@@ -206,10 +223,16 @@ export default abstract class APIImportTask<
return await this.scheduleNextTask(nextImportTask);
}
// All tasks for this import have been processed.
// All tasks for this import have been processed. Run the post-completion
// hook before flipping state so subclasses can perform work that must
// happen before "imports.processed" downstream handlers fire.
await this.onAllTasksCompleted(importTask);
await sequelize.transaction(async (transaction) => {
const associatedImport = importTask.import;
associatedImport.state = ImportState.Processed;
// Release any cross-phase scratch state — the import is done with it.
associatedImport.scratch = null;
await associatedImport.saveWithCtx(
createContext({
user: associatedImport.createdBy,
@@ -222,13 +245,63 @@ export default abstract class APIImportTask<
}
/**
* Process the import task.
* This fetches data from external source and converts it to task output.
* Whether the base class should create Attachment rows and upload S3 blobs
* per page during `onProcess`. Defaults to `true` for sources whose
* attachments are addressable by per-task URLs (e.g. Notion). Sources where
* attachments are shared across pages or live in a single archive may
* override this and handle attachment persistence in `onAllTasksCompleted`.
*
* @returns true to enable the per-page attachment upload step.
*/
protected shouldUploadAttachmentsPerPage(): boolean {
return true;
}
/**
* Hook invoked after the final import task has been processed but before the
* associated `Import` state transitions to `Processed`. Subclasses can
* override to perform cross-task finalization (e.g. uploading shared
* attachments) that must happen before the persistence pass.
*
* The hook is awaited before the import's state is updated and before its
* `scratch` value is cleared, so an override can still read `scratch`, and a
* rejection from an override prevents the transition to `Processed`.
*
* The base implementation does nothing.
*
* @param lastImportTask The most recently completed ImportTask for the import.
* @returns Promise that resolves when finalization is complete.
*/
protected async onAllTasksCompleted(
// oxlint-disable-next-line @typescript-eslint/no-unused-vars
lastImportTask: ImportTask<T>
): Promise<void> {
return;
}
/**
* Bootstrap phase. Runs once per import on a worker that owns the source
* artifact (e.g. extracts a zip, walks the file tree, schedules child page
* tasks). Subclasses without a bootstrap step leave this unimplemented; the
* base only invokes it when an `ImportTask` is created with
* `phase === ImportTaskPhase.Bootstrap`.
*
* @param importTask ImportTask model to process.
* @returns Promise with output that resolves once processing has completed.
*/
protected abstract process(
protected processBootstrap(
// oxlint-disable-next-line @typescript-eslint/no-unused-vars
importTask: ImportTask<T>
): Promise<ProcessOutput<T>> {
throw new Error(
`${this.constructor.name} does not implement processBootstrap()`
);
}
/**
* Page phase. Runs for every `ImportTask` row with
* `phase === ImportTaskPhase.Page`, transforming a batch of source pages
* into ProseMirror output and optionally cascading descendants as the next
* wave of child tasks.
*
* @param importTask ImportTask model to process.
* @returns Promise with output that resolves once processing has completed.
*/
protected abstract processPage(
importTask: ImportTask<T>
): Promise<ProcessOutput<T>>;
@@ -1,139 +0,0 @@
/* oxlint-disable @typescript-eslint/no-empty-function */
import path from "node:path";
import { FileOperation } from "@server/models";
import { buildFileOperation } from "@server/test/factories";
import ImportMarkdownZipTask from "./ImportMarkdownZipTask";
describe("ImportMarkdownZipTask", () => {
  /**
   * Builds a file operation whose `handle` points at a zip inside the test
   * fixtures directory and makes `FileOperation.findByPk` resolve to it.
   */
  const stubFileOperation = async (fixtureName: string) => {
    const fileOperation = await buildFileOperation();

    Object.defineProperty(fileOperation, "handle", {
      get: () => ({
        path: path.resolve(
          __dirname,
          "..",
          "..",
          "test",
          "fixtures",
          fixtureName
        ),
        cleanup: async () => {},
      }),
    });
    vi.spyOn(FileOperation, "findByPk").mockResolvedValue(fileOperation);

    return fileOperation;
  };

  /** Runs the task against the given fixture and returns its response. */
  const importFixture = async (fixtureName: string) => {
    const { id } = await stubFileOperation(fixtureName);
    return new ImportMarkdownZipTask().perform({ fileOperationId: id });
  };

  /**
   * Runs the task against the given fixture and returns whatever it throws,
   * or undefined when it completes. Only the task itself is guarded, so a
   * failure while preparing the fixture still fails the test.
   */
  const captureFailure = async (fixtureName: string) => {
    const { id } = await stubFileOperation(fixtureName);

    let error;
    try {
      await new ImportMarkdownZipTask().perform({ fileOperationId: id });
    } catch (err) {
      error = err;
    }
    return error;
  };

  it("should import the documents, attachments", async () => {
    const response = await importFixture("outline-markdown.zip");

    expect(response.collections.size).toEqual(1);
    expect(response.documents.size).toEqual(8);
    expect(response.attachments.size).toEqual(6);
  }, 10000);

  it("should import the documents, public attachments", async () => {
    const response = await importFixture("outline-markdown-public.zip");

    expect(response.collections.size).toEqual(1);
    expect(response.documents.size).toEqual(2);
    expect(response.attachments.size).toEqual(1);
  }, 10000);

  it("should throw an error with corrupt zip", async () => {
    const error = await captureFailure("corrupt.zip");

    expect(error && error.message).toBeTruthy();
  });

  it("should throw an error with empty collection in zip", async () => {
    const error = await captureFailure("empty.zip");

    expect(error && error.message).toContain(
      "Uploaded file does not contain any valid collections"
    );
  });
});
@@ -1,286 +0,0 @@
import path from "node:path";
import fs from "fs-extra";
import { escapeRegExp } from "es-toolkit/compat";
import mime from "mime-types";
import { randomUUID } from "node:crypto";
import documentImporter from "@server/commands/documentImporter";
import { createContext } from "@server/context";
import Logger from "@server/logging/Logger";
import type { FileOperation } from "@server/models";
import { User } from "@server/models";
import { Buckets } from "@server/models/helpers/AttachmentHelper";
import { sequelize } from "@server/storage/database";
import type { FileTreeNode } from "@server/utils/ImportHelper";
import ImportHelper from "@server/utils/ImportHelper";
import type { StructuredImportData } from "./ImportTask";
import ImportTask from "./ImportTask";
/**
 * Import task that turns a directory tree of Markdown files into structured
 * import data: top-level folders become collections, files and folders below
 * them become documents, and attachment folders become attachments.
 */
export default class ImportMarkdownZipTask extends ImportTask {
  /**
   * Builds structured import data from the file tree rooted at `dirPath`.
   *
   * @param dirPath Directory that is read through `ImportHelper.toFileTree`.
   * @param fileOperation The file operation the import belongs to.
   * @returns The collections, documents and attachments found in the tree.
   * @throws Error when no file tree could be produced for `dirPath`.
   */
  public async parseData(
    dirPath: string,
    fileOperation: FileOperation
  ): Promise<StructuredImportData> {
    const tree = await ImportHelper.toFileTree(dirPath);
    if (!tree) {
      throw new Error("Could not find valid content in zip file");
    }

    return this.parseFileTree(fileOperation, tree.children);
  }

  /**
   * Check if a folder should be treated as a folder of attachments. A folder
   * qualifies when it is non-empty and either is named "attachments"
   * (case-insensitive) or contains only non-markdown files and nested folders
   * that themselves qualify.
   *
   * @param node The file tree node to check
   * @returns true if the folder should be imported as attachments
   */
  private isAttachmentFolder(node: FileTreeNode): boolean {
    if (node.children.length === 0) {
      return false;
    }

    if (node.title.toLowerCase() === "attachments") {
      return true;
    }

    return node.children.every((child) => {
      // If child has children, it's a folder - recurse to check its contents
      if (child.children.length > 0) {
        return this.isAttachmentFolder(child);
      }

      // Child has no children - could be a file or empty folder
      const ext = path.extname(child.name).toLowerCase();

      // If no extension, it's likely an empty folder, not a file.
      // Be conservative and don't treat it as an attachment.
      if (!ext) {
        return false;
      }

      // It's a file with an extension - check if it's NOT markdown
      return ext !== ".md" && ext !== ".markdown";
    });
  }

  /**
   * Recursively process all files in a folder as attachments. Each leaf node
   * is pushed onto `output.attachments` with a fresh id; its content is read
   * lazily through the `buffer` callback.
   *
   * @param node The file tree node to process
   * @param output The structured import data to add attachments to
   */
  private parseAttachmentFolder(
    node: FileTreeNode,
    output: StructuredImportData
  ): void {
    for (const child of node.children) {
      if (child.children.length > 0) {
        this.parseAttachmentFolder(child, output);
      } else {
        const id = randomUUID();
        output.attachments.push({
          id,
          name: child.name,
          path: child.path,
          mimeType: mime.lookup(child.path) || "application/octet-stream",
          buffer: () => fs.readFile(child.path),
        });
      }
    }
  }

  /**
   * Compiles, once per attachment, the patterns under which that attachment
   * may be referenced from document text together with the placeholder that
   * replaces each match. Patterns are returned in the order they must be
   * applied.
   *
   * @param attachments Attachments collected from the file tree
   * @returns One entry per attachment, in the same order as `attachments`
   */
  private buildAttachmentRewrites(
    attachments: StructuredImportData["attachments"]
  ): Array<{ reference: string; patterns: RegExp[] }> {
    return attachments.map((attachment) => {
      const encodedPath = encodeURI(attachment.path);
      const attachmentFileName = path.basename(attachment.path);

      // Pull the collection and subdirectory out of the path name, upload
      // folders in an export are relative to the document itself.
      // Support both legacy bucket names (uploads/public) and generic attachment folders.
      const normalizedAttachmentPath = encodedPath
        .replace(
          new RegExp(`(.*)/${Buckets.uploads}/`),
          `${Buckets.uploads}/`
        )
        .replace(new RegExp(`(.*)/${Buckets.public}/`), `${Buckets.public}/`);

      // Also try normalizing to just the folder containing the attachment
      // This handles arbitrary folder names like "attachments/"
      const attachmentDir = path.basename(path.dirname(attachment.path));
      const genericNormalizedPath = `${attachmentDir}/${encodeURI(attachmentFileName)}`;

      const patterns = [
        new RegExp(escapeRegExp(encodedPath), "g"),
        new RegExp(`\\.?/?${escapeRegExp(normalizedAttachmentPath)}`, "g"),
        new RegExp(`\\.?/?${escapeRegExp(genericNormalizedPath)}`, "g"),
      ];

      // Handle markdown links that reference attachments via a path rooted
      // at an "attachments" folder, optionally prefixed with "./", e.g.
      // ./attachments/foo.png or ./attachments/sub/foo.png.
      const segments = attachment.path.split(path.sep);
      const attachmentsIdx = segments.findIndex(
        (seg) => seg.toLowerCase() === "attachments"
      );
      if (attachmentsIdx >= 0) {
        const relFromAttachments = segments.slice(attachmentsIdx).join("/");
        patterns.push(
          new RegExp(
            `\\.?/?${escapeRegExp(encodeURI(relFromAttachments))}`,
            "g"
          )
        );
      }

      return { reference: `<<${attachment.id}>>`, patterns };
    });
  }

  /**
   * Converts the file structure from zipAsFileTree into documents,
   * collections, and attachments. After the tree has been walked, attachment
   * references and internal `.md` links in every document's text are replaced
   * with `<<id>>` placeholders.
   *
   * @param fileOperation The file operation
   * @param tree An array of FileTreeNode representing root files in the zip
   * @returns A StructuredImportData object
   * @throws Rejects when the file operation's user cannot be loaded
   * (`rejectOnEmpty`).
   */
  private async parseFileTree(
    fileOperation: FileOperation,
    tree: FileTreeNode[]
  ): Promise<StructuredImportData> {
    const user = await User.findByPk(fileOperation.userId, {
      rejectOnEmpty: true,
    });
    const output: StructuredImportData = {
      collections: [],
      documents: [],
      attachments: [],
    };

    const docPathToIdMap = new Map<string, string>();

    const parseNodeChildren = async (
      children: FileTreeNode[],
      collectionId: string,
      parentDocumentId?: string
    ): Promise<void> => {
      for (const child of children) {
        // special case for folders of attachments - detect by content
        if (child.children.length > 0 && this.isAttachmentFolder(child)) {
          this.parseAttachmentFolder(child, output);
          continue;
        }

        const id = randomUUID();

        const { title, icon, text } = await sequelize.transaction(
          async (transaction) =>
            documentImporter({
              mimeType: "text/markdown",
              fileName: child.name,
              content:
                child.children.length > 0
                  ? ""
                  : await fs.readFile(child.path, "utf8"),
              user,
              ctx: createContext({ user, transaction }),
            })
        );

        const existingDocumentIndex = output.documents.findIndex(
          (doc) =>
            doc.title === title &&
            doc.collectionId === collectionId &&
            doc.parentDocumentId === parentDocumentId
        );

        const existingDocument = output.documents[existingDocumentIndex];

        // When there is a file and a folder with the same name this handles
        // the case by combining the two into one document with nested children
        if (existingDocument) {
          docPathToIdMap.set(child.path, existingDocument.id);

          if (existingDocument.text === "") {
            output.documents[existingDocumentIndex].text = text;
          }

          await parseNodeChildren(
            child.children,
            collectionId,
            existingDocument.id
          );
        } else {
          docPathToIdMap.set(child.path, id);

          output.documents.push({
            id,
            title,
            icon,
            text,
            collectionId,
            parentDocumentId,
            path: child.path,
            mimeType: "text/markdown",
          });

          await parseNodeChildren(child.children, collectionId, id);
        }
      }
    };

    // All nodes in the root level should be collections
    for (const node of tree) {
      if (node.children.length > 0) {
        // Check if this is an attachments-only folder at root level
        if (this.isAttachmentFolder(node)) {
          this.parseAttachmentFolder(node, output);
          continue;
        }

        const collectionId = randomUUID();
        output.collections.push({
          id: collectionId,
          name: node.title,
        });
        await parseNodeChildren(node.children, collectionId);
      } else {
        Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
          fileOperationId: fileOperation.id,
        });
      }
    }

    // The patterns depend only on the attachment, so compile them once rather
    // than once per document. Skipped entirely when there is no text to rewrite.
    const attachmentRewrites =
      output.documents.length > 0
        ? this.buildAttachmentRewrites(output.attachments)
        : [];

    for (const document of output.documents) {
      // Check all of the attachments we've created against urls in the text
      // and replace them out with attachment redirect urls before continuing.
      for (const { reference, patterns } of attachmentRewrites) {
        for (const pattern of patterns) {
          document.text = document.text.replace(pattern, reference);
        }
      }

      const basePath = path.dirname(document.path);

      // check internal document links in the text and replace them with placeholders.
      // When persisting, the placeholders will be replaced with the right urls.
      const internalLinks = [
        ...document.text.matchAll(/\[[^\]]+\]\(([^)]+\.md)\)/g),
      ];

      internalLinks.forEach((match) => {
        const referredDocPath = match[1];
        const normalizedDocPath = decodeURI(
          path.normalize(`${basePath}/${referredDocPath}`)
        );

        const referredDocId = docPathToIdMap.get(normalizedDocPath);
        if (referredDocId) {
          document.text = document.text.replace(
            referredDocPath,
            `<<${referredDocId}>>`
          );
        }
      });
    }

    return output;
  }
}
@@ -0,0 +1,116 @@
import {
rewriteAttachmentPaths,
rewriteInternalLinks,
} from "./MarkdownAPIImportTask";
describe("rewriteAttachmentPaths", () => {
  // Each case feeds `text` and `attachments` to rewriteAttachmentPaths and
  // expects exactly `expected` back.
  const cases: {
    name: string;
    text: string;
    attachments: { id: string; pathInZip: string }[];
    expected: string;
  }[] = [
    {
      name: "replaces a direct encoded path with the placeholder",
      text: "![alt](My%20Collection/attachments/foo.png)",
      attachments: [
        { id: "att-1", pathInZip: "My Collection/attachments/foo.png" },
      ],
      expected: "![alt](<<att-1>>)",
    },
    {
      name: "normalizes legacy `uploads/` bucket layout",
      text: "![x](./uploads/abc/file.png)",
      attachments: [
        { id: "att-2", pathInZip: "Some Collection/uploads/abc/file.png" },
      ],
      expected: "![x](<<att-2>>)",
    },
    {
      name: "normalizes legacy `public/` bucket layout",
      text: "![x](./public/abc/file.png)",
      attachments: [
        { id: "att-3", pathInZip: "Some Collection/public/abc/file.png" },
      ],
      expected: "![x](<<att-3>>)",
    },
    {
      name: "handles arbitrary folder names like 'attachments/'",
      text: "![x](./attachments/foo.png)",
      attachments: [
        { id: "att-4", pathInZip: "Collection/attachments/foo.png" },
      ],
      expected: "![x](<<att-4>>)",
    },
    {
      name: "matches nested attachments folders",
      text: "![x](./attachments/sub/bar.png)",
      attachments: [
        { id: "att-5", pathInZip: "Collection/Doc/attachments/sub/bar.png" },
      ],
      expected: "![x](<<att-5>>)",
    },
    {
      name: "substitutes multiple references in the same document",
      text: "![a](./attachments/a.png) and ![b](./attachments/b.png)",
      attachments: [
        { id: "id-a", pathInZip: "C/attachments/a.png" },
        { id: "id-b", pathInZip: "C/attachments/b.png" },
      ],
      expected: "![a](<<id-a>>) and ![b](<<id-b>>)",
    },
    {
      name: "is a no-op when no attachments match",
      text: "![x](https://example.com/a.png)",
      attachments: [{ id: "id-a", pathInZip: "C/attachments/a.png" }],
      expected: "![x](https://example.com/a.png)",
    },
  ];

  for (const { name, text, attachments, expected } of cases) {
    it(name, () => {
      expect(rewriteAttachmentPaths(text, attachments)).toBe(expected);
    });
  }
});
// Unit tests for rewriteInternalLinks: relative `.md` link targets are
// resolved against the directory of the referring document
// ("Collection/parent.md" in every case) and looked up in the path -> id map.
describe("rewriteInternalLinks", () => {
it("rewrites a sibling .md link to a placeholder", () => {
const out = rewriteInternalLinks(
"see [other](./other.md)",
"Collection/parent.md",
{ "Collection/other.md": "doc-1" }
);
expect(out).toBe("see [other](<<doc-1>>)");
});
it("rewrites a nested .md link", () => {
const out = rewriteInternalLinks(
"see [child](./sub/child.md)",
"Collection/parent.md",
{ "Collection/sub/child.md": "doc-2" }
);
expect(out).toBe("see [child](<<doc-2>>)");
});
// A target that is absent from the map is left exactly as written.
it("leaves unresolved .md links untouched", () => {
const out = rewriteInternalLinks(
"see [missing](./missing.md)",
"Collection/parent.md",
{}
);
expect(out).toBe("see [missing](./missing.md)");
});
// Only targets ending in `.md` are considered at all.
it("ignores non-md links", () => {
const out = rewriteInternalLinks(
"see [site](https://example.com)",
"Collection/parent.md",
{ "Collection/parent.md": "doc-self" }
);
expect(out).toBe("see [site](https://example.com)");
});
// Map keys are unencoded paths, so `%20` must be decoded before lookup.
it("decodes encoded path segments before lookup", () => {
const out = rewriteInternalLinks(
"see [other](./My%20Doc.md)",
"Collection/parent.md",
{ "Collection/My Doc.md": "doc-3" }
);
expect(out).toBe("see [other](<<doc-3>>)");
});
});
@@ -0,0 +1,672 @@
import path from "node:path";
import { randomUUID } from "node:crypto";
import { escapeRegExp } from "es-toolkit/compat";
import fs from "fs-extra";
import mime from "mime-types";
import { UniqueConstraintError } from "sequelize";
import tmp from "tmp";
import type {
ImportTaskInput,
ImportTaskOutput,
MarkdownAttachmentManifestItem,
MarkdownPageImportTaskInputItem,
} from "@shared/schema";
import type { IntegrationService, ProsemirrorDoc } from "@shared/types";
import { AttachmentPreset } from "@shared/types";
import attachmentCreator from "@server/commands/attachmentCreator";
import { createContext } from "@server/context";
import env from "@server/env";
import Logger from "@server/logging/Logger";
import type { ImportTask } from "@server/models";
import { Attachment } from "@server/models";
import { Buckets } from "@server/models/helpers/AttachmentHelper";
import { ProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper";
import { sequelize } from "@server/storage/database";
import FileStorage from "@server/storage/files";
import type { FileTreeNode } from "@server/utils/ImportHelper";
import ImportHelper from "@server/utils/ImportHelper";
import ZipHelper from "@server/utils/ZipHelper";
import type { ProcessOutput } from "./APIImportTask";
import APIImportTask from "./APIImportTask";
import { DocumentConverter } from "@server/utils/DocumentConverter";
type Markdown = IntegrationService.Markdown;
/** Result of downloading and extracting the uploaded zip (see downloadAndExtract). */
interface ExtractedZip {
// Temporary directory the zip contents were extracted into.
dirPath: string;
// Removes the temporary directory and releases the storage file handle.
cleanup: () => Promise<void>;
}
/** A document found while walking the extracted zip during bootstrap. */
interface DiscoveredDocument {
// Pre-assigned UUID; becomes the page task's externalId.
id: string;
// Title taken from the file tree node.
title: string;
// Path of the source file/folder relative to the extraction root.
pathInZip: string;
// Pre-assigned id of the enclosing collection.
collectionId: string;
// Id of the parent document when nested inside a folder; unset at top level.
parentDocumentId?: string;
// Raw markdown read from the file; empty string for folder-only entries.
markdownText: string;
// Direct descendants discovered inside a same-named folder.
children: DiscoveredDocument[];
}
/** A collection discovered from a top-level folder of the zip. */
interface DiscoveredCollection {
// Pre-assigned UUID used as the collection's externalId.
id: string;
// Title taken from the folder's file tree node.
title: string;
// Top-level documents of the collection (each carrying its own subtree).
children: DiscoveredDocument[];
}
/** Minimal attachment description needed by rewriteAttachmentPaths. */
interface AttachmentRef {
// Id written into the `<<id>>` placeholder.
id: string;
// Unencoded path of the attachment inside the zip.
pathInZip: string;
}
/**
 * Rewrites local attachment paths in markdown text into `<<attachmentId>>`
 * placeholders. For every attachment the following reference forms are
 * substituted, in this order:
 *
 * 1. the full zip path, URI-encoded;
 * 2. the encoded path truncated to start at a legacy bucket folder
 *    (`uploads/` or `public/`);
 * 3. the encoded path truncated to start at its first `attachments` segment;
 * 4. `<parent folder>/<file name>`, with the parent folder matched both as
 *    written in the zip and URI-encoded (the file name is always encoded).
 *
 * Forms 2–4 also consume an optional leading `./` or `/`.
 *
 * Exported for tests; not part of the module's public surface.
 *
 * @param markdown The raw markdown text from a single document.
 * @param attachments Attachment manifest entries to substitute.
 * @returns Markdown text with local paths replaced by `<<id>>` references.
 */
export function rewriteAttachmentPaths(
  markdown: string,
  attachments: AttachmentRef[]
): string {
  // Loop-invariant patterns: strip everything in front of a bucket folder.
  const uploadsPrefix = new RegExp(`(.*)/${Buckets.uploads}/`);
  const publicPrefix = new RegExp(`(.*)/${Buckets.public}/`);

  // Matches `target` literally, optionally preceded by `./` or `/`.
  const relativeReference = (target: string) =>
    new RegExp(`\\.?/?${escapeRegExp(target)}`, "g");

  let text = markdown;

  for (const attachment of attachments) {
    const encodedPath = encodeURI(attachment.pathInZip);
    const encodedFileName = encodeURI(path.basename(attachment.pathInZip));
    const reference = `<<${attachment.id}>>`;

    const normalizedAttachmentPath = encodedPath
      .replace(uploadsPrefix, `${Buckets.uploads}/`)
      .replace(publicPrefix, `${Buckets.public}/`);

    text = text
      .replace(new RegExp(escapeRegExp(encodedPath), "g"), reference)
      .replace(relativeReference(normalizedAttachmentPath), reference);

    const segments = attachment.pathInZip.split(path.sep);
    const attachmentsIdx = segments.findIndex(
      (seg) => seg.toLowerCase() === "attachments"
    );
    if (attachmentsIdx >= 0) {
      const relFromAttachments = segments.slice(attachmentsIdx).join("/");
      text = text.replace(
        relativeReference(encodeURI(relFromAttachments)),
        reference
      );
    }

    const attachmentDir = path.basename(path.dirname(attachment.pathInZip));
    const genericNormalizedPath = `${attachmentDir}/${encodedFileName}`;
    text = text.replace(relativeReference(genericNormalizedPath), reference);

    // Markdown link targets are normally URI-encoded, so a folder such as
    // "My Files" appears as "My%20Files". Match that form too; previously
    // only the file name was encoded and such references were never rewritten.
    const encodedGenericPath = `${encodeURI(attachmentDir)}/${encodedFileName}`;
    if (encodedGenericPath !== genericNormalizedPath) {
      text = text.replace(relativeReference(encodedGenericPath), reference);
    }
  }

  return text;
}
/**
 * Rewrites internal markdown links (`[label](./relative.md)`) into
 * `<<documentId>>` placeholders, resolved against a path → id map built from
 * the zip's full document tree.
 *
 * Only the link *target* is rewritten: the label and any other occurrence of
 * the same path elsewhere in the text are left untouched. Targets that do not
 * resolve to an entry in `docMap` are left exactly as written.
 *
 * Exported for tests; not part of the module's public surface.
 *
 * @param markdown The raw markdown text from a single document.
 * @param documentPath Zip-relative path of the document being rewritten
 *                     (e.g. `Collection/parent.md`); used as the base for
 *                     resolving relative link targets against docMap keys.
 * @param docMap Map of document path (as it appeared in the zip) to its
 *               pre-assigned externalId.
 * @returns Markdown text with internal `.md` link targets replaced by
 *          `<<id>>` references.
 */
export function rewriteInternalLinks(
  markdown: string,
  documentPath: string,
  docMap: Record<string, string>
): string {
  const basePath = path.dirname(documentPath);

  // Capture the pieces around the target so the replacement is anchored to
  // the link itself rather than to the first textual occurrence of the path.
  return markdown.replace(
    /(\[[^\]]+\]\()([^)]+\.md)(\))/g,
    (full: string, open: string, referredDocPath: string, close: string) => {
      const normalizedDocPath = path.normalize(
        `${basePath}/${referredDocPath}`
      );

      let lookupPath = normalizedDocPath;
      try {
        lookupPath = decodeURI(normalizedDocPath);
      } catch (err) {
        // A literal "%" in a file name (e.g. "100%.md") is not a valid escape
        // sequence; fall back to the path as written instead of aborting.
        if (!(err instanceof URIError)) {
          throw err;
        }
      }

      const referredDocId = docMap[lookupPath];
      return referredDocId ? `${open}<<${referredDocId}>>${close}` : full;
    }
  );
}
export default class MarkdownAPIImportTask extends APIImportTask<Markdown> {
/**
 * Always returns false. This importer creates its Attachment rows in
 * onAllTasksCompleted from the bootstrap manifest.
 * NOTE(review): how the base class consumes this flag is not visible here —
 * confirm against APIImportTask.
 */
protected shouldUploadAttachmentsPerPage(): boolean {
return false;
}
/**
 * Schedules a new MarkdownAPIImportTask run for the given import task row.
 *
 * @param importTask The ImportTask whose id is passed to the scheduled run.
 */
protected async scheduleNextTask(importTask: ImportTask<Markdown>) {
await new MarkdownAPIImportTask().schedule({ importTaskId: importTask.id });
}
/**
 * Completion hook: re-downloads and re-extracts the zip, then creates one
 * Attachment per manifest entry using the id that was pre-assigned during
 * bootstrap. Returns early when the import has no storage key or an empty
 * manifest. The extracted directory is always cleaned up.
 *
 * @param lastImportTask The final ImportTask; its `import` supplies the
 *                       scratch state and the creating user.
 * @throws Any attachment creation error other than a unique-constraint
 *         collision.
 */
protected async onAllTasksCompleted(
lastImportTask: ImportTask<Markdown>
): Promise<void> {
const scratch = lastImportTask.import.scratch;
if (!scratch?.storageKey || !scratch.manifest?.length) {
return;
}
const { dirPath, cleanup } = await this.downloadAndExtract(
scratch.storageKey
);
try {
const createdBy = lastImportTask.import.createdBy;
for (const item of scratch.manifest) {
// pathInZip is relative to the extraction root, so it resolves against
// the fresh temp dir from this run.
const filePath = path.join(dirPath, item.pathInZip);
let buffer: Buffer;
try {
buffer = await fs.readFile(filePath);
} catch (err) {
// An unreadable file is logged and skipped rather than failing the import.
Logger.warn(
`Markdown import attachment missing in zip, skipping: ${item.pathInZip}`,
err instanceof Error ? err : undefined
);
continue;
}
try {
await sequelize.transaction(async (transaction) =>
attachmentCreator({
source: "import",
preset: AttachmentPreset.DocumentAttachment,
id: item.id,
name: item.name,
type: item.mimeType,
buffer,
user: createdBy,
ctx: createContext({ user: createdBy, transaction }),
fetchOptions: {
timeout: env.FILE_STORAGE_IMPORT_TIMEOUT,
},
})
);
} catch (err) {
// Each attachment commits in its own transaction, so a retry of
// this hook can re-encounter ids that already landed. Treat the
// unique-id collision as a no-op so the import remains resumable.
if (err instanceof UniqueConstraintError) {
continue;
}
throw err;
}
}
} finally {
await cleanup();
}
}
/**
 * Bootstrap phase: extracts the uploaded zip, discovers collections,
 * documents and attachments, records the discovered collections and the
 * attachment manifest on the Import, and returns the collection outputs plus
 * the first wave of per-page task inputs (top-level documents only).
 *
 * @param importTask The bootstrap ImportTask; its `import.scratch.storageKey`
 *                   identifies the uploaded zip.
 * @returns Collection outputs for this task and the child task inputs.
 * @throws Error when the storage key is missing or the zip has no usable
 *         content.
 */
protected async processBootstrap(
importTask: ImportTask<Markdown>
): Promise<ProcessOutput<Markdown>> {
const storageKey = importTask.import.scratch?.storageKey;
if (!storageKey) {
throw new Error("Markdown import is missing scratch.storageKey");
}
const { dirPath, cleanup } = await this.downloadAndExtract(storageKey);
try {
const tree = await ImportHelper.toFileTree(dirPath);
if (!tree) {
throw new Error("Could not find valid content in zip file");
}
const collections: DiscoveredCollection[] = [];
const manifest: MarkdownAttachmentManifestItem[] = [];
// Every top-level folder becomes a collection, except attachment-only
// folders, whose files go straight into the manifest.
for (const node of tree.children) {
// Top-level entries without children (loose files) are not imported.
if (node.children.length === 0) {
Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
importTaskId: importTask.id,
});
continue;
}
if (this.isAttachmentFolder(node)) {
this.collectAttachments(node, manifest, dirPath);
continue;
}
const collection: DiscoveredCollection = {
id: randomUUID(),
title: node.title,
children: [],
};
collections.push(collection);
await this.collectDocumentsAndAttachments({
children: node.children,
collectionId: collection.id,
out: collection.children,
manifest,
extractionRoot: dirPath,
});
}
// Build docMap (pathInZip -> externalId) for internal-link resolution.
// Walk the full document tree to collect every doc id, since internal
// markdown links can target any document regardless of depth.
const docMap: Record<string, string> = {};
const collectDocMap = (docs: DiscoveredDocument[]) => {
for (const d of docs) {
docMap[d.pathInZip] = d.id;
collectDocMap(d.children);
}
};
for (const c of collections) {
collectDocMap(c.children);
}
// Replace (not append) anything past the create-time placeholder with
// the freshly discovered collections so a retried bootstrap doesn't
// accumulate duplicate entries with fresh UUIDs from a previous
// partial run. ImportsProcessor's persistence pass treats these as
// collections.
const associatedImport = importTask.import;
const placeholder = associatedImport.input[0];
associatedImport.input = [
placeholder,
...collections.map((c) => ({
externalId: c.id,
permission: placeholder.permission,
})),
];
// Persist the manifest so the completion phase can create attachments
// without re-walking the tree.
associatedImport.scratch = { storageKey, manifest };
await associatedImport.save();
// Append collection placeholder items so ImportsProcessor iterates
// them during the bootstrap row (the earliest createdAt) — that
// guarantees collections land in the DB before any per-page document
// references them.
const collectionInputItems: MarkdownPageImportTaskInputItem[] =
collections.map((c) => ({
externalId: c.id,
title: c.title,
path: c.title,
markdownText: "",
attachmentMap: [],
docMap: {},
}));
importTask.input = [importTask.input[0], ...collectionInputItems];
const collectionOutputs: ImportTaskOutput = collections.map((c) => ({
externalId: c.id,
title: c.title,
content: ProsemirrorHelper.getEmptyDocument() as ProsemirrorDoc,
}));
// First wave of document tasks: only top-level docs in each collection.
// Each carries its descendants in `children` and the per-page handler
// re-emits them as the next wave of childTasksInput, producing a strict
// depth-ordered cascade of ImportTask rows so parent FKs are always
// satisfied at child-doc creation time.
const childTasksInput: ImportTaskInput<Markdown> = collections.flatMap(
(c) => c.children.map((d) => this.toPageInput(d, manifest, docMap))
);
return { taskOutput: collectionOutputs, childTasksInput };
} finally {
await cleanup();
}
}
/**
 * Builds the per-page task input for a discovered document. Descendants are
 * converted recursively and nested under `children`, so every level of the
 * tree can be scheduled as its own task wave.
 *
 * @param doc The discovered document, including its descendants.
 * @param manifest The full attachment manifest (filtered down per page).
 * @param docMap Path → externalId map for internal link rewriting.
 * @returns A self-contained per-page task input.
 */
private toPageInput(
  doc: DiscoveredDocument,
  manifest: MarkdownAttachmentManifestItem[],
  docMap: Record<string, string>
): MarkdownPageImportTaskInputItem {
  const { id, parentDocumentId, collectionId, title, pathInZip, markdownText } =
    doc;
  const attachmentMap = this.attachmentsReferencedBy(markdownText, manifest);

  // Leaf documents carry `undefined` rather than an empty array.
  const nested =
    doc.children.length > 0
      ? doc.children.map((child) => this.toPageInput(child, manifest, docMap))
      : undefined;

  return {
    externalId: id,
    parentExternalId: parentDocumentId,
    collectionExternalId: collectionId,
    title,
    path: pathInZip,
    markdownText,
    attachmentMap,
    docMap,
    children: nested,
  };
}
/**
 * Page phase: converts every input item of the task into an output entry and
 * collects the items' direct children as the next wave of task inputs.
 *
 * @param importTask The page ImportTask whose `input` holds the page items.
 * @returns Converted outputs plus the child task inputs to schedule next.
 */
protected async processPage(
  importTask: ImportTask<Markdown>
): Promise<ProcessOutput<Markdown>> {
  const pages = importTask.input as MarkdownPageImportTaskInputItem[];
  const taskOutput: ImportTaskOutput = [];
  const childTasksInput: MarkdownPageImportTaskInputItem[] = [];

  for (const page of pages) {
    if (page.markdownText) {
      const converted = await DocumentConverter.convert(
        this.rewriteMarkdown(page),
        path.basename(page.path),
        "text/markdown"
      );
      taskOutput.push({
        externalId: page.externalId,
        title: converted.title || page.title,
        icon: converted.icon,
        content: converted.doc.toJSON() as ProsemirrorDoc,
      });
    } else {
      // Items without markdown (collection placeholders, folder-only
      // documents) are emitted with an empty document body.
      taskOutput.push({
        externalId: page.externalId,
        title: page.title,
        content: ProsemirrorHelper.getEmptyDocument() as ProsemirrorDoc,
      });
    }

    // Direct descendants form the next task wave; their ImportTask rows are
    // created after this one returns, which keeps parents ahead of children
    // when the results are persisted.
    const descendants = page.children ?? [];
    if (descendants.length > 0) {
      childTasksInput.push(...descendants);
    }
  }

  return { taskOutput, childTasksInput };
}
/**
 * Pre-rewrites a page's markdown text before conversion:
 *
 * 1. Internal `.md` links are reduced to `<<documentId>>` placeholders and
 *    then turned into mention markdown so the editor parses them as Document
 *    mentions.
 * 2. Local attachment paths are reduced to `<<attachmentId>>` placeholders
 *    and then substituted with their attachment redirect URLs. Resolving at
 *    the text layer avoids markdown-it parsing `<<id>>` as an
 *    angle-bracket-wrapped URL (which produced broken image src attrs).
 *
 * Only placeholders whose id is known to this page (a `docMap` value or an
 * `attachmentMap` id) are resolved. Any other `<<…>>` text is author content,
 * for example `a << b >> c`, and is left untouched.
 *
 * @param page The per-page task input.
 * @returns Rewritten markdown text ready for DocumentConverter.
 */
private rewriteMarkdown(page: MarkdownPageImportTaskInputItem): string {
  const documentIds = new Set(Object.values(page.docMap));
  const attachmentIds = new Set(page.attachmentMap.map((m) => m.id));

  let text = rewriteInternalLinks(page.markdownText, page.path, page.docMap);

  // Convert `[label](<<id>>)` links from rewriteInternalLinks into mention
  // markdown the editor recognises: `@[label](mention://<uuid>/document/<id>)`.
  // Ids never contain whitespace or angle brackets, so excluding those keeps
  // the match from spanning unrelated text.
  text = text.replace(
    /\[([^\]]+)\]\(<<([^<>\s]+)>>\)/g,
    (full: string, label: string, externalId: string) =>
      documentIds.has(externalId)
        ? `@[${label}](mention://${randomUUID()}/document/${externalId})`
        : full
  );

  text = rewriteAttachmentPaths(
    text,
    page.attachmentMap.map((m) => ({ id: m.id, pathInZip: m.pathInZip }))
  );

  // Resolve the attachment placeholders produced above to redirect URLs.
  text = text.replace(/<<([^<>\s]+)>>/g, (full: string, id: string) =>
    attachmentIds.has(id) ? Attachment.getRedirectUrl(id) : full
  );

  return text;
}
/**
 * Selects the manifest entries whose file name occurs in the markdown text,
 * either verbatim or URI-encoded. Keeps the per-page task input from
 * carrying the whole manifest.
 *
 * @param markdown Raw markdown text for a single document.
 * @param manifest The full attachment manifest from the bootstrap phase.
 * @returns Manifest entries that appear (by filename) in the markdown.
 */
private attachmentsReferencedBy(
  markdown: string,
  manifest: MarkdownAttachmentManifestItem[]
): MarkdownAttachmentManifestItem[] {
  const referenced: MarkdownAttachmentManifestItem[] = [];

  for (const entry of manifest) {
    const baseName = path.basename(entry.pathInZip);
    if (
      markdown.includes(baseName) ||
      markdown.includes(encodeURI(baseName))
    ) {
      referenced.push(entry);
    }
  }

  return referenced;
}
/**
 * Decides whether a folder holds only attachments. A folder qualifies when
 * it is named "attachments" (case-insensitive), or when every entry in it is
 * either a qualifying sub-folder or a file with an extension other than
 * `.md` / `.markdown`. Nodes without children never qualify.
 *
 * @param node FileTreeNode to inspect.
 * @returns true when the folder appears to hold only attachments.
 */
private isAttachmentFolder(node: FileTreeNode): boolean {
  if (node.children.length === 0) {
    return false;
  }
  if (node.title.toLowerCase() === "attachments") {
    return true;
  }

  for (const entry of node.children) {
    if (entry.children.length > 0) {
      if (!this.isAttachmentFolder(entry)) {
        return false;
      }
      continue;
    }

    const extension = path.extname(entry.name).toLowerCase();
    // Extension-less files and markdown files disqualify the folder.
    if (!extension || extension === ".md" || extension === ".markdown") {
      return false;
    }
  }

  return true;
}
/**
 * Adds every file below an attachment-only folder to the manifest,
 * descending into sub-folders. Each entry gets a fresh UUID and a
 * `pathInZip` relative to the extraction root, so the file can be located
 * again when the zip is re-extracted into a different temp dir during the
 * completion phase.
 *
 * @param node Attachment-folder FileTreeNode.
 * @param manifest Manifest array to push entries into.
 * @param extractionRoot Absolute path to the zip extraction root.
 */
private collectAttachments(
  node: FileTreeNode,
  manifest: MarkdownAttachmentManifestItem[],
  extractionRoot: string
): void {
  for (const entry of node.children) {
    const isFolder = entry.children.length > 0;

    if (isFolder) {
      this.collectAttachments(entry, manifest, extractionRoot);
    } else {
      const relativePath = path.relative(extractionRoot, entry.path);
      const mimeType = mime.lookup(entry.path) || "application/octet-stream";

      manifest.push({
        id: randomUUID(),
        name: entry.name,
        pathInZip: relativePath,
        mimeType,
      });
    }
  }
}
/**
 * Walks a collection subtree and gathers documents (markdown files and
 * folders) and loose attachments. Documents are appended to `out` as a tree —
 * each entry's `children` holds its direct descendants. This is the shape the
 * per-page task cascade consumes.
 *
 * A markdown file and a folder sharing the same title are merged into a
 * single document regardless of the order in which they are visited.
 *
 * @param children FileTreeNode children of the current folder.
 * @param collectionId Pre-assigned id of the enclosing collection.
 * @param parentDocumentId Optional parent document id when nested.
 * @param out Sibling accumulator to push discovered documents into.
 * @param manifest Attachment manifest accumulator.
 * @param extractionRoot Absolute path to the zip extraction root; recorded
 *                       paths are made relative to it.
 * @returns Promise that resolves when the subtree has been processed.
 */
private async collectDocumentsAndAttachments({
  children,
  collectionId,
  parentDocumentId,
  out,
  manifest,
  extractionRoot,
}: {
  children: FileTreeNode[];
  collectionId: string;
  parentDocumentId?: string;
  out: DiscoveredDocument[];
  manifest: MarkdownAttachmentManifestItem[];
  extractionRoot: string;
}): Promise<void> {
  for (const child of children) {
    if (child.children.length > 0 && this.isAttachmentFolder(child)) {
      this.collectAttachments(child, manifest, extractionRoot);
      continue;
    }

    const ext = path.extname(child.name).toLowerCase();
    const isMarkdown = ext === ".md" || ext === ".markdown";
    const isFolder = child.children.length > 0;
    const pathInZip = path.relative(extractionRoot, child.path);

    // Any other loose file is treated as an attachment.
    if (!isMarkdown && !isFolder) {
      manifest.push({
        id: randomUUID(),
        name: child.name,
        pathInZip,
        mimeType: mime.lookup(child.path) || "application/octet-stream",
      });
      continue;
    }

    const markdownText = isFolder
      ? ""
      : await fs.readFile(child.path, "utf8");

    // Folder-and-file with the same title (a "name.md" alongside a "name/"
    // directory) is merged onto a single document: the folder body picks up
    // the file's markdown text, and the folder's contents become children.
    const sibling = out.find((d) => d.title === child.title);
    if (sibling) {
      if (sibling.markdownText === "" && markdownText) {
        sibling.markdownText = markdownText;
        // When the folder was visited first the document still carries the
        // folder's path. Point it at the markdown file instead so the
        // path → id map contains the path other documents actually link to.
        if (!/\.(md|markdown)$/i.test(sibling.pathInZip)) {
          sibling.pathInZip = pathInZip;
        }
      }
      if (isFolder) {
        await this.collectDocumentsAndAttachments({
          children: child.children,
          collectionId,
          parentDocumentId: sibling.id,
          out: sibling.children,
          manifest,
          extractionRoot,
        });
      }
      continue;
    }

    const node: DiscoveredDocument = {
      id: randomUUID(),
      title: child.title,
      pathInZip,
      collectionId,
      parentDocumentId,
      markdownText,
      children: [],
    };
    out.push(node);

    if (isFolder) {
      await this.collectDocumentsAndAttachments({
        children: child.children,
        collectionId,
        parentDocumentId: node.id,
        out: node.children,
        manifest,
        extractionRoot,
      });
    }
  }
}
/**
 * Fetches the uploaded zip from file storage and unpacks it into a newly
 * created temporary directory. If anything fails along the way, the
 * directory (when already created) and the file handle are released before
 * the error is rethrown.
 *
 * @param storageKey Storage key for the uploaded zip.
 * @returns The temp dir path and a cleanup callback. Caller must invoke
 *          cleanup() once finished.
 */
private async downloadAndExtract(storageKey: string): Promise<ExtractedZip> {
  const handle = await FileStorage.getFileHandle(storageKey);

  // Both teardown steps are best-effort: failures are deliberately ignored.
  const removeDir = (dir: string) =>
    fs.rm(dir, { recursive: true, force: true }).catch(() => {});
  const releaseHandle = () => handle.cleanup().catch(() => {});

  let dirPath: string | undefined;
  try {
    const extractedDir = await new Promise<string>((resolve, reject) => {
      tmp.dir({ unsafeCleanup: true }, (err, tmpDir) => {
        if (err) {
          reject(err);
        } else {
          resolve(tmpDir);
        }
      });
    });
    dirPath = extractedDir;

    await ZipHelper.extract(handle.path, extractedDir);

    return {
      dirPath: extractedDir,
      cleanup: async () => {
        await removeDir(extractedDir);
        await releaseHandle();
      },
    };
  } catch (err) {
    if (dirPath) {
      await removeDir(dirPath);
    }
    await releaseHandle();
    throw err;
  }
}
}
+4 -2
View File
@@ -76,9 +76,11 @@ export const CollectionsImportSchema = BaseSchema.extend({
.nullish()
.transform((val) => (isUndefined(val) ? null : val)),
attachmentId: z.uuid(),
// Markdown zip imports now run through `imports.create` →
// MarkdownAPIImportTask, so only JSON is accepted here.
format: z
.enum(FileOperationFormat)
.prefault(FileOperationFormat.MarkdownZip),
.literal(FileOperationFormat.JSON)
.prefault(FileOperationFormat.JSON),
}),
});
+40 -7
View File
@@ -1,15 +1,16 @@
import Router from "koa-router";
import { randomUUID } from "node:crypto";
import { truncate } from "es-toolkit/compat";
import type { WhereOptions } from "sequelize";
import type { IntegrationType } from "@shared/types";
import { ImportState, UserRole } from "@shared/types";
import { ImportState, IntegrationService, UserRole } from "@shared/types";
import { ImportValidation } from "@shared/validations";
import { UnprocessableEntityError } from "@server/errors";
import auth from "@server/middlewares/authentication";
import { rateLimiter } from "@server/middlewares/rateLimiter";
import { transaction } from "@server/middlewares/transaction";
import validate from "@server/middlewares/validate";
import { Integration } from "@server/models";
import { Attachment, Integration } from "@server/models";
import Import from "@server/models/Import";
import { authorize } from "@server/policies";
import { presentImport, presentPolicies } from "@server/presenters";
@@ -27,7 +28,7 @@ router.post(
validate(T.ImportsCreateSchema),
transaction(),
async (ctx: APIContext<T.ImportsCreateReq>) => {
const { integrationId, service, input } = ctx.input.body;
const body = ctx.input.body;
const { user } = ctx.state.auth;
authorize(user, "createImport", user.team);
@@ -47,9 +48,41 @@ router.post(
throw UnprocessableEntityError("An import is already in progress");
}
if (body.service === IntegrationService.Markdown) {
const attachment = await Attachment.findByPk(body.attachmentId, {
rejectOnEmpty: true,
});
authorize(user, "read", attachment);
const importModel = await Import.createWithCtx(ctx, {
name: truncate(attachment.name, {
length: ImportValidation.maxNameLength,
}),
service: IntegrationService.Markdown,
state: ImportState.Created,
input: [
{
externalId: randomUUID(),
permission: body.permission,
},
],
scratch: { storageKey: attachment.key },
integrationId: null,
createdById: user.id,
teamId: user.teamId,
});
importModel.createdBy = user;
ctx.body = {
data: presentImport(importModel),
policies: presentPolicies(user, [importModel]),
};
return;
}
const integration = await Integration.findByPk<
Integration<IntegrationType.Import>
>(integrationId, {
>(body.integrationId, {
rejectOnEmpty: true,
});
authorize(user, "read", integration);
@@ -58,10 +91,10 @@ router.post(
const importModel = await Import.createWithCtx(ctx, {
name: truncate(name, { length: ImportValidation.maxNameLength }),
service,
service: body.service,
state: ImportState.Created,
input,
integrationId,
input: body.input,
integrationId: body.integrationId,
createdById: user.id,
teamId: user.teamId,
});
+6
View File
@@ -1,6 +1,7 @@
import { z } from "zod";
import { NotionImportInputItemSchema } from "@shared/schema";
import {
CollectionPermission,
ImportableIntegrationService,
IntegrationService,
} from "@shared/types";
@@ -37,6 +38,11 @@ export const ImportsCreateSchema = BaseSchema.extend({
service: z.literal(IntegrationService.Notion),
input: z.array(NotionImportInputItemSchema),
}),
z.object({
service: z.literal(IntegrationService.Markdown),
attachmentId: z.uuid(),
permission: z.enum(CollectionPermission).optional(),
}),
]),
});
+87 -3
View File
@@ -20,8 +20,20 @@ export const NotionImportInputItemSchema = BaseImportInputItemSchema.extend({
export type NotionImportInput = z.infer<typeof NotionImportInputItemSchema>[];
/**
 * Input item of a Markdown import: the base import input item with a
 * required string `externalId`.
 */
export const MarkdownImportInputItemSchema = BaseImportInputItemSchema.extend({
externalId: z.string(),
});
/** Array form of the Markdown import input, as stored on the import. */
export type MarkdownImportInput = z.infer<
typeof MarkdownImportInputItemSchema
>[];
export type ImportInput<T extends ImportableIntegrationService> =
T extends IntegrationService.Notion ? NotionImportInput : BaseImportInput;
T extends IntegrationService.Notion
? NotionImportInput
: T extends IntegrationService.Markdown
? MarkdownImportInput
: BaseImportInput;
export const BaseImportTaskInputItemSchema = z.object({
externalId: z.string(),
@@ -42,16 +54,88 @@ export type NotionImportTaskInput = z.infer<
typeof NotionImportTaskInputItemSchema
>[];
/**
 * Manifest entry describing a single attachment discovered during the
 * Markdown zip bootstrap phase. The `id` is a pre-assigned UUID used both
 * in the `<<id>>` placeholder that the per-page markdown rewrite resolves to
 * an attachment redirect URL, and as the id of the Attachment row created
 * during completion.
 */
export const MarkdownAttachmentManifestItemSchema = z.object({
// Pre-assigned attachment id.
id: z.uuid(),
// File name of the attachment.
name: z.string(),
// MIME type recorded for the attachment.
mimeType: z.string(),
// Path of the file relative to the zip extraction root.
pathInZip: z.string(),
});
export type MarkdownAttachmentManifestItem = z.infer<
typeof MarkdownAttachmentManifestItemSchema
>;
/**
 * Markdown importer scratch state. `storageKey` is set at import creation
 * (it's the only durable handle on the uploaded zip). `manifest` is added
 * by the bootstrap phase so the completion phase can re-download the zip
 * and create Attachment rows without re-walking the tree.
 */
export interface MarkdownImportScratch {
// Storage key of the uploaded zip attachment.
storageKey: string;
// Attachments discovered in the zip; absent until bootstrap has run.
manifest?: MarkdownAttachmentManifestItem[];
}
/**
 * Per-importer scratch shape stored on `Import.scratch`. Holds cross-phase
 * state that the importer needs between bootstrap and completion but that
 * isn't part of any single task's input. Cleared when the import flips to
 * `Processed`.
 *
 * Resolves to `MarkdownImportScratch` for the Markdown service and to
 * `never` for every other importable service.
 */
export type ImportScratch<T extends ImportableIntegrationService> =
T extends IntegrationService.Markdown ? MarkdownImportScratch : never;
/**
 * Per-page task input. Generated by the bootstrap task and consumed by
 * subsequent MarkdownAPIImportTask runs. `children` carries this document's
 * direct descendants so that each level of the document tree is scheduled
 * as a separate task wave; this preserves parent-before-child ordering
 * during persistence (createdAt of child tasks is strictly later than
 * parents'). The type is defined as a TypeScript interface rather than via
 * z.infer because it is only consumed internally — never validated at an
 * API boundary — and zod's recursive-schema ergonomics aren't worth the
 * cost here.
 */
export interface MarkdownPageImportTaskInputItem {
// Pre-assigned id of the document (or collection placeholder).
externalId: string;
// externalId of the parent document, when nested.
parentExternalId?: string;
// externalId of the enclosing collection; unset on collection placeholders.
collectionExternalId?: string;
// Title discovered from the zip; used when conversion yields no title.
title: string;
// Zip-relative path of the source; collection placeholders use their title.
path: string;
// Raw markdown text; an empty string yields an empty document body.
markdownText: string;
// Manifest entries whose file name appears in `markdownText`.
attachmentMap: MarkdownAttachmentManifestItem[];
// Zip-relative document path → externalId, for internal link rewriting.
docMap: Record<string, string>;
// Direct descendants, scheduled as the next task wave.
children?: MarkdownPageImportTaskInputItem[];
}
/**
 * Markdown import task input — a bootstrap row carrying only the base
 * placeholder item (the zip's `storageKey` lives on `Import.scratch`), or a
 * page row carrying per-document content. Both item shapes may occur in the
 * same array: the bootstrap task appends collection placeholder items after
 * its base item.
 */
export type MarkdownImportTaskInput = (
| BaseImportTaskInput[number]
| MarkdownPageImportTaskInputItem
)[];
export type ImportTaskInput<T extends ImportableIntegrationService> =
T extends IntegrationService.Notion
? NotionImportTaskInput
: BaseImportTaskInput;
: T extends IntegrationService.Markdown
? MarkdownImportTaskInput
: BaseImportTaskInput;
// No reason to be here except for co-location with import task input.
export type ImportTaskOutput = {
externalId: string;
title: string;
emoji?: string;
icon?: string;
author?: string;
content: ProsemirrorDoc;
createdAt?: Date;
+18 -1
View File
@@ -97,6 +97,21 @@ export enum ImportTaskState {
Canceled = "canceled",
}
/**
 * Classifies the work an `ImportTask` row represents. Set when the task is
 * created and used by `APIImportTask` to dispatch to the right handler.
 *
 * - `Bootstrap` runs once per import on a worker that owns the source
 *   artifact (e.g. extracts a zip, discovers structure, schedules child
 *   tasks). Subclasses without a bootstrap step never produce these.
 * - `Page` is the per-document work that the bootstrap (or `ImportsProcessor`
 *   for sources without a bootstrap, like Notion) fans out into.
 */
export enum ImportTaskPhase {
// One-off discovery step that fans out into page tasks.
Bootstrap = "bootstrap",
// Per-document conversion work.
Page = "page",
}
export enum MentionType {
User = "user",
Document = "document",
@@ -151,15 +166,17 @@ export enum IntegrationService {
Linear = "linear",
Figma = "figma",
Notion = "notion",
Markdown = "markdown",
}
export type ImportableIntegrationService = Extract<
IntegrationService,
IntegrationService.Notion
IntegrationService.Notion | IntegrationService.Markdown
>;
export const ImportableIntegrationService = {
Notion: IntegrationService.Notion,
Markdown: IntegrationService.Markdown,
} as const;
export type IssueTrackerIntegrationService = Extract<