Compare commits

...

5 Commits

Author SHA1 Message Date
Tom Moor b57bae00b9 translations 2025-04-01 20:40:49 -04:00
codegen-sh[bot] f19587fc15 Remove Notion import fixtures 2025-03-30 20:01:59 +00:00
codegen-sh[bot] e6c3d782da Restore Notion format references for backward compatibility 2025-03-30 18:57:21 +00:00
codegen-sh[bot] f3e5a39250 Fix Notion importer cleanup PR based on feedback 2025-03-30 18:54:29 +00:00
codegen-sh[bot] f15ac90930 Cleanup the old Notion importer 2025-03-29 14:47:59 +00:00
7 changed files with 0 additions and 486 deletions
@@ -1,36 +0,0 @@
import * as React from "react";
import { Trans, useTranslation } from "react-i18next";
import { FileOperationFormat } from "@shared/types";
import useStores from "~/hooks/useStores";
import DropToImport from "./DropToImport";
import HelpDisclosure from "./HelpDisclosure";
/**
 * Dialog body for importing a Notion export. Renders a collapsible help
 * section explaining where to find the export in Notion, followed by a drop
 * target that submits the file using the Notion file operation format and
 * closes all modals on submit.
 */
function ImportNotionDialog() {
  const { t } = useTranslation();
  const { dialogs } = useStores();

  // Title shown on the help disclosure toggle.
  const helpTitle = <Trans>Where do I find the file?</Trans>;

  // <em> tags inside the translated string are rendered as <strong>.
  const emphasisComponents = {
    em: <strong />,
  };

  const helpSection = (
    <HelpDisclosure title={helpTitle}>
      <Trans
        defaults="In Notion, click <em>Settings & Members</em> in the left sidebar and open Settings. Look for the Export section, and click <em>Export all workspace content</em>. Choose <em>HTML</em> as the format for the best data compatability."
        components={emphasisComponents}
      />
    </HelpDisclosure>
  );

  const dropZone = (
    <DropToImport
      format={FileOperationFormat.Notion}
      onSubmit={dialogs.closeAllModals}
    >
      <>
        {t(
          `Drag and drop the zip file from Notion's HTML export option, or click to upload`
        )}
      </>
    </DropToImport>
  );

  return (
    <>
      {helpSection}
      {dropZone}
    </>
  );
}
export default ImportNotionDialog;
@@ -6,7 +6,6 @@ import ExportJSONTask from "../tasks/ExportJSONTask";
import ExportMarkdownZipTask from "../tasks/ExportMarkdownZipTask";
import ImportJSONTask from "../tasks/ImportJSONTask";
import ImportMarkdownZipTask from "../tasks/ImportMarkdownZipTask";
import ImportNotionTask from "../tasks/ImportNotionTask";
import BaseProcessor from "./BaseProcessor";
export default class FileOperationCreatedProcessor extends BaseProcessor {
@@ -25,11 +24,6 @@ export default class FileOperationCreatedProcessor extends BaseProcessor {
fileOperationId: event.modelId,
});
break;
case FileOperationFormat.Notion:
await ImportNotionTask.schedule({
fileOperationId: event.modelId,
});
break;
case FileOperationFormat.JSON:
await ImportJSONTask.schedule({
fileOperationId: event.modelId,
@@ -1,87 +0,0 @@
/* eslint-disable @typescript-eslint/no-empty-function */
import path from "path";
import { FileOperation } from "@server/models";
import { buildFileOperation } from "@server/test/factories";
import ImportNotionTask from "./ImportNotionTask";
describe("ImportNotionTask", () => {
  /**
   * Creates a file operation whose `handle` getter points at the named zip in
   * the test fixtures directory, and stubs `FileOperation.findByPk` so that it
   * resolves to that file operation.
   *
   * @param fixtureName File name of the zip inside test/fixtures
   * @returns The prepared file operation
   */
  const prepareFileOperation = async (fixtureName: string) => {
    const operation = await buildFileOperation();
    const fixturePath = path.resolve(
      __dirname,
      "..",
      "..",
      "test",
      "fixtures",
      fixtureName
    );

    Object.defineProperty(operation, "handle", {
      get: () => ({
        path: fixturePath,
        cleanup: async () => {},
      }),
    });
    jest.spyOn(FileOperation, "findByPk").mockResolvedValue(operation);

    return operation;
  };

  it("should import successfully from a Markdown export", async () => {
    const fileOperation = await prepareFileOperation("notion-markdown.zip");

    const response = await new ImportNotionTask().perform({
      fileOperationId: fileOperation.id,
    });

    expect(response.collections.size).toEqual(2);
    expect(response.documents.size).toEqual(6);
    expect(response.attachments.size).toEqual(1);

    // The image url in the text must have been swapped for the redirect url
    const importedAttachments = [...response.attachments.values()];
    const combinedText = [...response.documents.values()]
      .map((d) => d.text)
      .join("");
    expect(combinedText).toContain(importedAttachments[0].redirectUrl);
  });

  it("should import successfully from a HTML export", async () => {
    const fileOperation = await prepareFileOperation("notion-html.zip");

    const response = await new ImportNotionTask().perform({
      fileOperationId: fileOperation.id,
    });

    expect(response.collections.size).toEqual(2);
    expect(response.documents.size).toEqual(6);
    expect(response.attachments.size).toEqual(4);

    // The image url in the text must have been swapped for the redirect url
    const screenshot = [...response.attachments.values()].find((att) =>
      att.key.endsWith("Screen_Shot_2022-04-21_at_2.23.26_PM.png")
    );
    const combinedText = [...response.documents.values()]
      .map((d) => d.text)
      .join("");
    expect(combinedText).toContain(screenshot?.redirectUrl);
  });
});
-354
View File
@@ -1,354 +0,0 @@
import path from "path";
import fs from "fs-extra";
import compact from "lodash/compact";
import escapeRegExp from "lodash/escapeRegExp";
import mime from "mime-types";
import { v4 as uuidv4 } from "uuid";
import documentImporter from "@server/commands/documentImporter";
import { createContext } from "@server/context";
import Logger from "@server/logging/Logger";
import { FileOperation, User } from "@server/models";
import { sequelize } from "@server/storage/database";
import ImportHelper, { FileTreeNode } from "@server/utils/ImportHelper";
import ImportTask, { StructuredImportData } from "./ImportTask";
/**
 * Import task that turns the file tree of a Notion export into structured
 * import data: collections, documents, and attachments.
 */
export default class ImportNotionTask extends ImportTask {
  /**
   * Builds the structured import data for the export located at `dirPath`.
   *
   * @param dirPath Directory containing the contents of the export
   * @param fileOperation The file operation the import belongs to
   * @returns The collections, documents, and attachments found in the export
   * @throws Error if no file tree could be built from `dirPath`
   */
  public async parseData(
    dirPath: string,
    fileOperation: FileOperation
  ): Promise<StructuredImportData> {
    const tree = await ImportHelper.toFileTree(dirPath);
    if (!tree) {
      throw new Error("Could not find valid content in zip file");
    }

    // New Notion exports have a single folder with the name of the export, we must skip this
    // folder and go directly to the children.
    const hasWrapperFolder =
      tree.children.length === 1 &&
      tree.children[0].children.some((child) => child.title === "index");

    const parsed = await this.parseFileTree(
      fileOperation,
      hasWrapperFolder
        ? tree.children[0].children.filter((child) => child.title !== "index")
        : tree.children
    );

    // A lone collection without any documents is re-homed: its description (if
    // any) becomes a document inside a generic collection named "Notion".
    if (parsed.documents.length === 0 && parsed.collections.length === 1) {
      const collection = parsed.collections[0];
      const collectionId = uuidv4();

      if (collection.description) {
        parsed.documents.push({
          title: collection.name,
          icon: collection.icon,
          color: collection.color,
          path: "",
          text: String(collection.description),
          id: collection.id,
          externalId: collection.externalId,
          mimeType: "text/html",
          collectionId,
        });
      }

      collection.name = "Notion";
      collection.icon = undefined;
      collection.color = undefined;
      collection.externalId = undefined;
      collection.description = undefined;
      collection.id = collectionId;
    }

    return parsed;
  }

  /**
   * Converts the file structure from zipAsFileTree into documents,
   * collections, and attachments.
   *
   * @param fileOperation The file operation
   * @param tree An array of FileTreeNode representing root files in the zip
   * @returns A StructuredImportData object
   */
  private async parseFileTree(
    fileOperation: FileOperation,
    tree: FileTreeNode[]
  ): Promise<StructuredImportData> {
    const user = await User.findByPk(fileOperation.userId, {
      rejectOnEmpty: true,
    });

    const output: StructuredImportData = {
      collections: [],
      documents: [],
      attachments: [],
    };

    const parseNodeChildren = async (
      children: FileTreeNode[],
      collectionId: string,
      parentDocumentId?: string
    ): Promise<void> => {
      await Promise.all(
        children.map(async (child) => {
          // Ignore the CSV's for databases upfront
          if (child.path.endsWith(".csv")) {
            return;
          }

          const id = uuidv4();
          const match = child.title.match(this.NotionUUIDRegex);
          const name = child.title.replace(this.NotionUUIDRegex, "");
          const externalId = match ? match[0].trim() : undefined;

          // If it's not a text file we're going to treat it as an attachment.
          const mimeType = mime.lookup(child.name);
          const isDocument =
            mimeType === "text/markdown" ||
            mimeType === "text/plain" ||
            mimeType === "text/html";

          // If it's not a document and not a folder, treat it as an attachment
          if (!isDocument && mimeType) {
            output.attachments.push({
              id,
              name: child.name,
              path: child.path,
              mimeType,
              buffer: () => fs.readFile(child.path),
              externalId,
            });
            return;
          }

          Logger.debug("task", `Processing ${name} as ${mimeType}`);

          const { title, icon, text } = await sequelize.transaction(
            async (transaction) =>
              documentImporter({
                mimeType: mimeType || "text/markdown",
                fileName: name,
                content:
                  child.children.length > 0
                    ? ""
                    : await fs.readFile(child.path, "utf8"),
                user,
                ctx: createContext({ user, transaction }),
              })
          );

          // Only look for a merge candidate when this node actually carries a
          // Notion id. Without the guard, `undefined === undefined` would match
          // any earlier id-less document and this node's text would be dropped.
          const existingDocumentIndex =
            externalId === undefined
              ? -1
              : output.documents.findIndex(
                  (doc) => doc.externalId === externalId
                );
          const existingDocument = output.documents[existingDocumentIndex];

          // If there is an existing document with the same externalId that means
          // we've already parsed either a folder or a file referencing the same
          // document, as such we should merge.
          if (existingDocument) {
            if (existingDocument.text === "") {
              output.documents[existingDocumentIndex].text = text;
            }

            await parseNodeChildren(
              child.children,
              collectionId,
              existingDocument.id
            );
          } else {
            output.documents.push({
              id,
              title,
              icon,
              text,
              collectionId,
              parentDocumentId,
              path: child.path,
              mimeType: mimeType || "text/markdown",
              externalId,
            });
            await parseNodeChildren(child.children, collectionId, id);
          }
        })
      );
    };

    const replaceInternalLinksAndImages = (text: string) => {
      // Find if there are any images in this document
      const imagesInText = this.parseImages(text);

      for (const image of imagesInText) {
        const name = path.basename(image.src);
        const attachment = output.attachments.find(
          (att) =>
            att.path.endsWith(image.src) ||
            encodeURI(att.path).endsWith(image.src)
        );

        if (!attachment) {
          // Remote images are expected to have no matching attachment
          if (!image.src.startsWith("http")) {
            Logger.info(
              "task",
              `Could not find referenced attachment with name ${name} and src ${image.src}`
            );
          }
        } else {
          text = text.replace(
            new RegExp(escapeRegExp(image.src), "g"),
            `<<${attachment.id}>>`
          );
        }
      }

      // With Notion's HTML import, images sometimes come wrapped in anchor tags
      // This isn't supported in Outline's editor, so we need to strip them.
      text = text.replace(/\[!\[([^[]+)]/g, "![]");

      // Find if there are any links in this document pointing to other documents
      const internalLinksInText = this.parseInternalLinks(text);

      // For each link update to the standardized format of <<documentId>>
      // instead of a relative or absolute URL within the original zip file.
      for (const link of internalLinksInText) {
        const doc = output.documents.find(
          (doc) => doc.externalId === link.externalId
        );

        if (!doc) {
          Logger.info(
            "task",
            `Could not find referenced document with externalId ${link.externalId}`
          );
        } else {
          text = text.replace(link.href, `<<${doc.id}>>`);
        }
      }

      return text;
    };

    // All nodes in the root level should become collections
    for (const node of tree) {
      const match = node.title.match(this.NotionUUIDRegex);
      const name = node.title.replace(this.NotionUUIDRegex, "");
      const externalId = match ? match[0].trim() : undefined;
      const mimeType = mime.lookup(node.name);

      // As with documents, root nodes lacking a Notion id must never be merged
      // with each other, otherwise unrelated root nodes collapse into the first
      // id-less collection.
      const existingCollectionIndex =
        externalId === undefined
          ? -1
          : output.collections.findIndex(
              (collection) => collection.externalId === externalId
            );
      const existingCollection = output.collections[existingCollectionIndex];
      const collectionId = existingCollection?.id || uuidv4();
      let description;

      // Root level docs become the descriptions of collections
      if (
        mimeType === "text/markdown" ||
        mimeType === "text/plain" ||
        mimeType === "text/html"
      ) {
        const { text } = await sequelize.transaction(async (transaction) =>
          documentImporter({
            mimeType,
            fileName: name,
            content: await fs.readFile(node.path, "utf8"),
            user,
            ctx: createContext({ user, transaction }),
          })
        );

        description = text;
      } else if (node.children.length > 0) {
        await parseNodeChildren(node.children, collectionId);
      } else {
        Logger.debug("task", `Unhandled file in zip: ${node.path}`, {
          fileOperationId: fileOperation.id,
        });
        continue;
      }

      if (existingCollectionIndex !== -1) {
        if (description) {
          output.collections[existingCollectionIndex].description = description;
        }
      } else {
        output.collections.push({
          id: collectionId,
          name,
          description,
          externalId,
        });
      }
    }

    // Links and images can only be resolved once every document and attachment
    // has been collected, so rewriting happens as a final pass.
    for (const document of output.documents) {
      document.text = replaceInternalLinksAndImages(document.text);
    }

    for (const collection of output.collections) {
      if (typeof collection.description === "string") {
        collection.description = replaceInternalLinksAndImages(
          collection.description
        );
      }
    }

    return output;
  }

  /**
   * Extracts internal links from a markdown document, taking into account the
   * externalId of the document, which is part of the link title.
   *
   * @param text The markdown text to parse
   * @returns An array of internal links
   */
  private parseInternalLinks(
    text: string
  ): { title: string; href: string; externalId: string }[] {
    return compact(
      [...text.matchAll(this.NotionLinkRegex)].map((match) => ({
        title: match[1],
        href: match[2],
        externalId: match[3],
      }))
    );
  }

  /**
   * Extracts images from the markdown document
   *
   * @param text The markdown text to parse
   * @returns An array of images, each with its alt text and source
   */
  private parseImages(text: string): { alt: string; src: string }[] {
    return compact(
      [...text.matchAll(this.ImageRegex)].map((match) => ({
        alt: match[1],
        src: match[2],
      }))
    );
  }

  /**
   * Regex to find markdown images of all types
   */
  private ImageRegex =
    /!\[(?<alt>[^\][]*?)]\((?<filename>[^\][]*?)(?=“|\))“?(?<title>[^\][”]+)?”?\)/g;

  /**
   * Regex to find markdown links containing ID's that look like UUID's with the
   * "-"'s removed, Notion's externalId format.
   */
  private NotionLinkRegex = /\[([^[]+)]\((.*?([0-9a-fA-F]{32})\..*?)\)/g;

  /**
   * Regex to find Notion document UUID's in the title of a document.
   */
  private NotionUUIDRegex =
    /\s([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}|[0-9a-fA-F]{32})$/;
}
Binary file not shown.
Binary file not shown.
@@ -903,9 +903,6 @@
"{{ count }} document imported_plural": "{{ count }} documents imported",
"You can import a zip file that was previously exported from an Outline installation collections, documents, and images will be imported. In Outline, open <em>Export</em> in the Settings sidebar and click on <em>Export Data</em>.": "You can import a zip file that was previously exported from an Outline installation collections, documents, and images will be imported. In Outline, open <em>Export</em> in the Settings sidebar and click on <em>Export Data</em>.",
"Drag and drop the zip file from the Markdown export option in {{appName}}, or click to upload": "Drag and drop the zip file from the Markdown export option in {{appName}}, or click to upload",
"Where do I find the file?": "Where do I find the file?",
"In Notion, click <em>Settings & Members</em> in the left sidebar and open Settings. Look for the Export section, and click <em>Export all workspace content</em>. Choose <em>HTML</em> as the format for the best data compatability.": "In Notion, click <em>Settings & Members</em> in the left sidebar and open Settings. Look for the Export section, and click <em>Export all workspace content</em>. Choose <em>HTML</em> as the format for the best data compatability.",
"Drag and drop the zip file from Notion's HTML export option, or click to upload": "Drag and drop the zip file from Notion's HTML export option, or click to upload",
"Last active": "Last active",
"Guest": "Guest",
"Shared by": "Shared by",