perf: Remove turndown (#11331)

* Remove turndown

* Refactor htmlToProsemirror

* fix: Bug in CSV import

* refactor
This commit is contained in:
Tom Moor
2026-01-31 20:56:36 -05:00
committed by GitHub
parent 51dd516679
commit bb128318da
22 changed files with 909 additions and 1241 deletions
-3
View File
@@ -75,7 +75,6 @@
"@hocuspocus/extension-throttle": "1.1.2",
"@hocuspocus/provider": "1.1.2",
"@hocuspocus/server": "1.1.2",
"@joplin/turndown-plugin-gfm": "^1.0.49",
"@juggle/resize-observer": "^3.4.0",
"@linear/sdk": "^58.1.0",
"@node-oauth/oauth2-server": "^5.2.0",
@@ -253,7 +252,6 @@
"tiny-cookie": "^2.5.1",
"tmp": "^0.2.5",
"tunnel-agent": "^0.6.0",
"turndown": "^7.2.2",
"ukkonen": "^2.2.0",
"umzug": "^3.8.2",
"utility-types": "^3.11.0",
@@ -341,7 +339,6 @@
"@types/styled-components": "^5.1.32",
"@types/throng": "^5.0.7",
"@types/tmp": "^0.2.6",
"@types/turndown": "^5.0.6",
"@types/utf8": "^3.0.3",
"@types/validator": "^13.15.3",
"@types/yauzl": "^2.10.3",
+29 -101
View File
@@ -1,16 +1,11 @@
import emojiRegex from "emoji-regex";
import mime from "mime-types";
import type { Node } from "prosemirror-model";
import truncate from "lodash/truncate";
import parseTitle from "@shared/utils/parseTitle";
import type { ProsemirrorData } from "@shared/types";
import { ProsemirrorHelper as SharedProsemirrorHelper } from "@shared/utils/ProsemirrorHelper";
import { DocumentValidation } from "@shared/validations";
import { serializer } from "@server/editor";
import { traceFunction } from "@server/logging/tracing";
import type { User } from "@server/models";
import { ProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper";
import { TextHelper } from "@server/models/helpers/TextHelper";
import type { APIContext } from "@server/types";
import { DocumentConverter } from "@server/utils/DocumentConverter";
import { InvalidRequestError } from "../errors";
@@ -33,14 +28,11 @@ type ImportResult = {
/**
* Converts document content to state and validates size constraints.
*
* @param content The document content as markdown text or Prosemirror JSON.
* @param content The document content as Prosemirror JSON.
* @param title The document title (used in error messages).
* @returns The Y.Doc state buffer.
*/
function convertToState(
content: string | ProsemirrorData,
title: string
): Buffer {
function convertToState(content: ProsemirrorData, title: string): Buffer {
const ydoc = ProsemirrorHelper.toYDoc(content);
const state = ProsemirrorHelper.toState(ydoc);
@@ -53,92 +45,6 @@ function convertToState(
return state;
}
/**
 * Builds an import result from HTML by parsing it straight into a Prosemirror
 * document, with no intermediate markdown conversion.
 *
 * @param content The raw HTML as a string or Buffer.
 * @param title The fallback title, replaced when the first heading in the
 * document is a level 1 heading.
 * @param user The user performing the import.
 * @param ctx The API context.
 * @returns The serialized text, Y.Doc state, final title and optional icon.
 */
async function importHtml(
  content: Buffer | string,
  title: string,
  user: User,
  ctx: APIContext
): Promise<ImportResult> {
  const parsed: Node = DocumentConverter.htmlToProsemirror(content);

  // When the first heading is an H1 it becomes the title and is removed
  // from the body.
  const headings = SharedProsemirrorHelper.getHeadings(parsed);
  const leadsWithH1 = headings.length > 0 && headings[0].level === 1;
  const rawTitle = leadsWithH1 ? headings[0].title : title;
  const body = leadsWithH1
    ? ProsemirrorHelper.removeFirstHeading(parsed)
    : parsed;

  // Pull a leading emoji out of the document to use as the icon
  const extracted = ProsemirrorHelper.extractEmojiFromStart(body);
  const icon = extracted.emoji;

  // Swap external images for uploaded attachments
  const finalDoc = await TextHelper.replaceImagesWithAttachmentsInNode(
    ctx,
    extracted.doc,
    user
  );

  const text = serializer.serialize(finalDoc);
  const finalTitle = truncate(rawTitle, {
    length: DocumentValidation.maxTitleLength,
  });
  const state = convertToState(finalDoc.toJSON(), finalTitle);

  return { text, state, title: finalTitle, icon };
}
/**
 * Builds an import result by first converting the content to markdown
 * (used for docx, md, csv, etc.).
 *
 * @param content The raw file content as a string or Buffer.
 * @param fileName The name of the imported file.
 * @param mimeType The mime type of the imported file.
 * @param title The fallback title, replaced when the markdown starts with a
 * level 1 heading.
 * @param user The user performing the import.
 * @param ctx The API context.
 * @returns The markdown text, Y.Doc state, final title and optional icon.
 * @throws InvalidRequestError when the text exceeds the maximum state length.
 */
async function importMarkdown(
  content: Buffer | string,
  fileName: string,
  mimeType: string,
  title: string,
  user: User,
  ctx: APIContext
): Promise<ImportResult> {
  let markdown = await DocumentConverter.convertToMarkdown(
    content,
    fileName,
    mimeType
  );

  // Only the first ten characters are searched for an emoji; a match becomes
  // the icon and its first occurrence is stripped from the text.
  const emojiMatch = emojiRegex().exec(markdown.slice(0, 10));
  const icon = emojiMatch ? emojiMatch[0] : undefined;
  if (icon) {
    markdown = markdown.replace(icon, "");
  }

  // A leading "# " line is treated as the title and removed from the body
  let resolvedTitle = title;
  if (markdown.startsWith("# ")) {
    resolvedTitle = parseTitle(markdown).title;
    markdown = markdown.replace(/^.+(\n|$)/, "");
  }

  markdown = markdown
    .trim()
    // <br> tags become escaped newlines
    .replace(/<br>/gi, "\\n")
    // Drop formatting marks that are closed and immediately reopened
    .replace(/\*\*\*\*/gi, "")
    .replace(/____/gi, "");

  markdown = await TextHelper.replaceImagesWithAttachments(
    ctx,
    markdown,
    user
  );

  // Cheap pre-check: the text cannot possibly be longer than the state
  if (markdown.length > DocumentValidation.maxStateLength) {
    throw InvalidRequestError(
      `The document "${resolvedTitle}" is too large to import, please reduce the length and try again`
    );
  }

  resolvedTitle = truncate(resolvedTitle, {
    length: DocumentValidation.maxTitleLength,
  });
  const state = convertToState(markdown, resolvedTitle);

  return { text: markdown, state, title: resolvedTitle, icon };
}
async function documentImporter({
mimeType,
fileName,
@@ -154,18 +60,40 @@ async function documentImporter({
"html",
...(mime.extensions[mimeType] ?? []),
];
const title = fileName.replace(
const fileTitle = fileName.replace(
new RegExp(`\\.(${extensions.join("|")})$`, "i"),
""
);
const isHtml = mimeType === "text/html" || fileName.endsWith(".html");
// Convert document using unified converter
const {
doc,
title: extractedTitle,
icon,
} = await DocumentConverter.convert(content, fileName, mimeType);
if (isHtml) {
return importHtml(content, title, user, ctx);
// Use extracted title or fall back to filename
let title = extractedTitle || fileTitle;
// Replace external images with attachments
const processedDoc = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
);
// Serialize final text and handle empty documents
let text = serializer.serialize(processedDoc).trim();
// Empty paragraphs serialize to escaped newlines/backslashes, treat as empty
if (/^[\\\s]*$/.test(text)) {
text = "";
}
return importMarkdown(content, fileName, mimeType, title, user, ctx);
// Truncate title and validate size
title = truncate(title, { length: DocumentValidation.maxTitleLength });
const state = convertToState(processedDoc.toJSON() as ProsemirrorData, title);
return { text, state, title, icon };
}
export default traceFunction({
+142 -113
View File
@@ -2,10 +2,14 @@ import { faker } from "@faker-js/faker";
import type { DeepPartial } from "utility-types";
import type { ProsemirrorData } from "@shared/types";
import { MentionType } from "@shared/types";
import { ProsemirrorHelper as SharedProsemirrorHelper } from "@shared/utils/ProsemirrorHelper";
import { createContext } from "@server/context";
import { buildProseMirrorDoc, buildUser } from "@server/test/factories";
import type { MentionAttrs } from "./ProsemirrorHelper";
import { ProsemirrorHelper } from "./ProsemirrorHelper";
jest.mock("@server/storage/files");
describe("ProsemirrorHelper", () => {
describe("processMentions", () => {
it("should handle deleted users", async () => {
@@ -932,141 +936,166 @@ describe("ProsemirrorHelper", () => {
});
});
describe("htmlToProsemirror", () => {
it("should convert basic HTML to Prosemirror", () => {
const html = "<p>Hello world</p>";
describe("replaceImagesWithAttachments", () => {
it("should return the same document when there are no images", async () => {
const user = await buildUser();
const ctx = createContext({ user });
const doc = ProsemirrorHelper.htmlToProsemirror(html);
const doc = buildProseMirrorDoc([
{
type: "paragraph",
content: [{ type: "text", text: "No images here" }],
},
]);
expect(doc.type.name).toBe("doc");
expect(doc.content.childCount).toBe(1);
expect(doc.content.child(0).type.name).toBe("paragraph");
expect(doc.content.child(0).textContent).toBe("Hello world");
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
);
expect(result.toJSON()).toEqual(doc.toJSON());
});
it("should convert HTML with heading", () => {
const html = "<h1>Title</h1><p>Content</p>";
it("should correctly identify images in a document", () => {
const doc = buildProseMirrorDoc([
{
type: "paragraph",
content: [
{
type: "image",
attrs: {
src: "https://example.com/image.png",
alt: "Test image",
},
},
],
},
]);
const doc = ProsemirrorHelper.htmlToProsemirror(html);
expect(doc.content.childCount).toBe(2);
expect(doc.content.child(0).type.name).toBe("heading");
expect(doc.content.child(0).attrs.level).toBe(1);
expect(doc.content.child(0).textContent).toBe("Title");
expect(doc.content.child(1).type.name).toBe("paragraph");
const images = SharedProsemirrorHelper.getImages(doc);
expect(images.length).toBe(1);
expect(images[0].attrs.src).toBe("https://example.com/image.png");
expect(images[0].attrs.alt).toBe("Test image");
});
it("should remove script tags", () => {
const html = "<p>Safe content</p><script>alert('xss')</script>";
it("should skip images with invalid URLs", async () => {
const user = await buildUser();
const ctx = createContext({ user });
const doc = ProsemirrorHelper.htmlToProsemirror(html);
const doc = buildProseMirrorDoc([
{
type: "paragraph",
content: [
{
type: "image",
attrs: {
src: "not-a-valid-url",
alt: "Invalid",
},
},
],
},
]);
expect(doc.textContent).toBe("Safe content");
expect(doc.textContent).not.toContain("alert");
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
);
// Document should remain unchanged since URL is invalid
expect(result.toJSON()).toEqual(doc.toJSON());
});
it("should remove style tags", () => {
const html = "<style>body { color: red; }</style><p>Content</p>";
it("should skip images with internal URLs", async () => {
const user = await buildUser();
const ctx = createContext({ user });
const doc = ProsemirrorHelper.htmlToProsemirror(html);
const doc = buildProseMirrorDoc([
{
type: "paragraph",
content: [
{
type: "image",
attrs: {
src: "/api/attachments.redirect?id=existing-id",
alt: "Internal",
},
},
],
},
]);
expect(doc.textContent).toBe("Content");
expect(doc.textContent).not.toContain("color");
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
);
// Document should remain unchanged since URL is internal
expect(result.toJSON()).toEqual(doc.toJSON());
});
it("should handle Buffer input", () => {
const html = Buffer.from("<p>From buffer</p>", "utf8");
it("should handle document with multiple node types", async () => {
const user = await buildUser();
const ctx = createContext({ user });
const doc = ProsemirrorHelper.htmlToProsemirror(html);
const doc = buildProseMirrorDoc([
{
type: "heading",
attrs: { level: 1 },
content: [{ type: "text", text: "Title" }],
},
{
type: "paragraph",
content: [{ type: "text", text: "Some text" }],
},
{
type: "paragraph",
content: [
{
type: "image",
attrs: {
src: "invalid-url",
alt: "Image",
},
},
],
},
]);
expect(doc.content.child(0).textContent).toBe("From buffer");
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
);
// Document structure should be preserved
expect(result.content.childCount).toBe(3);
expect(result.content.child(0).type.name).toBe("heading");
expect(result.content.child(1).type.name).toBe("paragraph");
expect(result.content.child(2).type.name).toBe("paragraph");
});
it("should convert HTML with lists", () => {
const html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
it("should handle empty document", async () => {
const user = await buildUser();
const ctx = createContext({ user });
const doc = ProsemirrorHelper.htmlToProsemirror(html);
const doc = buildProseMirrorDoc([
{
type: "paragraph",
content: [],
},
]);
expect(doc.content.childCount).toBe(1);
expect(doc.content.child(0).type.name).toBe("bullet_list");
expect(doc.content.child(0).content.childCount).toBe(2);
});
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
);
it("should convert HTML with bold and italic", () => {
const html = "<p><strong>Bold</strong> and <em>italic</em></p>";
const doc = ProsemirrorHelper.htmlToProsemirror(html);
const paragraph = doc.content.child(0);
expect(paragraph.type.name).toBe("paragraph");
// Check that marks are applied
const boldText = paragraph.content.child(0);
expect(boldText.text).toBe("Bold");
expect(boldText.marks.some((m) => m.type.name === "strong")).toBe(true);
const italicText = paragraph.content.child(2);
expect(italicText.text).toBe("italic");
expect(italicText.marks.some((m) => m.type.name === "em")).toBe(true);
});
it("should handle full HTML document", () => {
const html = `
<!DOCTYPE html>
<html>
<head>
<title>Test</title>
<meta charset="utf-8">
</head>
<body>
<h1>Document Title</h1>
<p>Paragraph content</p>
</body>
</html>
`;
const doc = ProsemirrorHelper.htmlToProsemirror(html);
expect(doc.content.childCount).toBe(2);
expect(doc.content.child(0).type.name).toBe("heading");
expect(doc.content.child(0).textContent).toBe("Document Title");
expect(doc.content.child(1).type.name).toBe("paragraph");
expect(doc.content.child(1).textContent).toBe("Paragraph content");
});
it("should remove emoticon images", () => {
const html = `<p>Hello <img class="emoticon" src="smile.png" alt=":)"> world</p>`;
const doc = ProsemirrorHelper.htmlToProsemirror(html);
// Emoticon image should be removed, text content remains
expect(doc.textContent).not.toContain(":)");
expect(doc.textContent).toContain("Hello");
expect(doc.textContent).toContain("world");
});
it("should remove Jira icon images", () => {
const html = `
<p>Issue: <span class="jira-issue-key"><img class="icon" src="icon.png">ABC-123</span></p>
`;
const doc = ProsemirrorHelper.htmlToProsemirror(html);
expect(doc.textContent).toBe("Issue: ABC-123");
});
it("should apply Confluence image sizing", () => {
const html = `
<p><img src="image.png" data-width="800" data-height="600" width="400"></p>
`;
const doc = ProsemirrorHelper.htmlToProsemirror(html);
const paragraph = doc.content.child(0);
const image = paragraph.content.child(0);
expect(image.type.name).toBe("image");
expect(image.attrs.width).toBe(400);
expect(image.attrs.height).toBe(300);
expect(result.toJSON()).toEqual(doc.toJSON());
});
});
});
+118 -84
View File
@@ -1,16 +1,13 @@
import emojiRegex from "emoji-regex";
import { JSDOM } from "jsdom";
import chunk from "lodash/chunk";
import compact from "lodash/compact";
import { EditorState } from "prosemirror-state";
import { EditorView } from "prosemirror-view";
import flatten from "lodash/flatten";
import isMatch from "lodash/isMatch";
import uniq from "lodash/uniq";
import {
Node,
Fragment,
DOMParser as ProsemirrorDOMParser,
} from "prosemirror-model";
import { Node, Fragment } from "prosemirror-model";
import { renderToString } from "react-dom/server";
import styled, { ServerStyleSheet, ThemeProvider } from "styled-components";
import { prosemirrorToYDoc } from "y-prosemirror";
@@ -22,17 +19,23 @@ import EditorContainer from "@shared/editor/components/Styles";
import GlobalStyles from "@shared/styles/globals";
import light from "@shared/styles/theme";
import type { ProsemirrorData, UnfurlResponse } from "@shared/types";
import { MentionType } from "@shared/types";
import { attachmentRedirectRegex } from "@shared/utils/ProsemirrorHelper";
import { AttachmentPreset, MentionType } from "@shared/types";
import {
attachmentRedirectRegex,
ProsemirrorHelper as SharedProsemirrorHelper,
} from "@shared/utils/ProsemirrorHelper";
import parseDocumentSlug from "@shared/utils/parseDocumentSlug";
import { isRTL } from "@shared/utils/rtl";
import { isInternalUrl } from "@shared/utils/urls";
import attachmentCreator from "@server/commands/attachmentCreator";
import { plugins, schema, parser } from "@server/editor";
import env from "@server/env";
import Logger from "@server/logging/Logger";
import { trace } from "@server/logging/tracing";
import Attachment from "@server/models/Attachment";
import User from "@server/models/User";
import FileStorage from "@server/storage/files";
import type { APIContext } from "@server/types";
export type HTMLOptions = {
/** A title, if it should be included */
@@ -798,88 +801,14 @@ export class ProsemirrorHelper {
};
}
/**
 * Parses HTML into a Prosemirror document node using the server schema.
 *
 * Script, style, title, head, meta and link elements are stripped and the
 * DOM is preprocessed before parsing. The global environment is patched with
 * the JSDOM window for the duration of the parse and restored afterwards,
 * even if parsing throws.
 *
 * @param content The HTML content as a string or Buffer.
 * @returns A Prosemirror Node representing the document.
 */
public static htmlToProsemirror(content: Buffer | string): Node {
  const html = typeof content === "string" ? content : content.toString("utf8");
  const dom = new JSDOM(html);
  const { document } = dom.window;

  // Strip elements that should never end up in the parsed document
  document
    .querySelectorAll("script, style, title, head, meta, link")
    .forEach((unwanted) => unwanted.remove());

  // Clean up images and other elements ahead of parsing
  this.preprocessHtmlForImport(document);

  // The Prosemirror DOMParser needs browser globals to be present
  const restoreGlobals = this.patchGlobalEnv(dom.window);

  try {
    return ProsemirrorDOMParser.fromSchema(schema).parse(document.body);
  } finally {
    restoreGlobals();
  }
}
/**
 * Preprocesses HTML DOM before Prosemirror parsing to cleanup
 * images and other elements. The document is mutated in place.
 *
 * - Images whose class contains "emoticon" are removed.
 * - Images with class "icon" inside a "jira-issue-key" parent are removed.
 * - Images carrying `data-width`, `data-height` and `width` get a `height`
 *   attribute scaled to keep the original aspect ratio. Nothing is written
 *   when any of the three values is not a positive, finite number.
 *
 * @param document The DOM document to preprocess.
 */
private static preprocessHtmlForImport(document: Document): void {
  const isUsableDimension = (value: number): boolean =>
    Number.isFinite(value) && value > 0;

  document.querySelectorAll("img").forEach((img) => {
    const className = img.className || "";

    // Emoticon images are dropped entirely
    if (className.includes("emoticon")) {
      img.remove();
      return;
    }

    // Remove Jira icon images
    if (
      className === "icon" &&
      img.parentElement?.className.includes("jira-issue-key")
    ) {
      img.remove();
      return;
    }

    // Handle Confluence image sizing: data-width/data-height → width/height
    const dataWidth = img.getAttribute("data-width");
    const dataHeight = img.getAttribute("data-height");
    const width = img.getAttribute("width");

    if (dataWidth && dataHeight && width) {
      const naturalWidth = parseInt(dataWidth, 10);
      const naturalHeight = parseInt(dataHeight, 10);
      const displayWidth = parseInt(width, 10);

      // Zero or non-numeric values would otherwise yield a height of
      // "NaN", "Infinity" or 0, so leave the image untouched in that case.
      if (
        isUsableDimension(naturalWidth) &&
        isUsableDimension(naturalHeight) &&
        isUsableDimension(displayWidth)
      ) {
        const ratio = naturalWidth / displayWidth;
        const calculatedHeight = Math.round(naturalHeight / ratio);
        img.setAttribute("height", String(calculatedHeight));
      }
    }
  });
}
/**
* Patches the global environment with properties from the JSDOM window,
* necessary for ProseMirror to run in a Node environment.
*
* @param domWindow The JSDOM window object
* @returns A cleanup function to restore the global environment
* @param domWindow The JSDOM window object.
* @returns A cleanup function to restore the global environment.
*/
private static patchGlobalEnv(domWindow: JSDOM["window"]) {
public static patchGlobalEnv(domWindow: JSDOM["window"]) {
const g = global as any;
const globalParams = {
@@ -922,4 +851,109 @@ export class ProsemirrorHelper {
});
};
}
/**
 * Replaces remote and base64 encoded images in the given Prosemirror node
 * with attachment urls and uploads the images to the storage provider.
 * Video nodes are processed the same way.
 *
 * Each distinct source is downloaded at most once, even when several nodes
 * reference it. Sources that are not parseable URLs, internal URLs, and
 * sources whose download fails are left untouched.
 *
 * @param ctx The API context.
 * @param doc The Prosemirror node to process.
 * @param user The user context.
 * @returns A new Prosemirror node with images replaced, or the given node
 * itself when nothing was replaced.
 */
static async replaceImagesWithAttachments(
  ctx: APIContext,
  doc: Node,
  user: User
): Promise<Node> {
  const images = SharedProsemirrorHelper.getImages(doc);
  const videos = SharedProsemirrorHelper.getVideos(doc);
  const nodes = [...images, ...videos];

  if (!nodes.length) {
    return doc;
  }

  const timeoutPerImage = Math.floor(
    Math.min(env.REQUEST_TIMEOUT / nodes.length, 10000)
  );

  // Pick one node per distinct downloadable source before any download
  // starts. De-duplicating inside the concurrent callbacks is racy: two nodes
  // in the same chunk sharing a src would both begin downloading before
  // either result is recorded, creating duplicate attachments.
  const pending: Map<string, Node> = new Map();
  for (const node of nodes) {
    const src = String(node.attrs.src ?? "");

    if (pending.has(src)) {
      continue;
    }

    // Skip invalid URLs
    try {
      new URL(src);
    } catch {
      continue;
    }

    // Skip internal URLs
    if (isInternalUrl(src)) {
      continue;
    }

    pending.set(src, node);
  }

  const urlToAttachment: Map<string, Attachment> = new Map();

  // Download at most ten sources concurrently
  for (const entries of chunk(Array.from(pending.entries()), 10)) {
    await Promise.all(
      entries.map(async ([src, node]) => {
        try {
          const attachment = await attachmentCreator({
            name: String(node.attrs.alt ?? node.type.name),
            url: src,
            preset: AttachmentPreset.DocumentAttachment,
            user,
            fetchOptions: {
              timeout: timeoutPerImage,
            },
            ctx,
          });

          if (attachment) {
            urlToAttachment.set(src, attachment);
          }
        } catch (err) {
          // Best effort: a failed download leaves the original src in place
          Logger.warn("Failed to download image for attachment", {
            error: err instanceof Error ? err.message : String(err),
            src,
          });
        }
      })
    );
  }

  // Nothing was uploaded, so there is nothing to rewrite
  if (!urlToAttachment.size) {
    return doc;
  }

  // Transform the document to replace image/video src attributes
  const transformFragment = (fragment: Fragment): Fragment => {
    const transformedNodes: Node[] = [];

    fragment.forEach((node) => {
      if (node.type.name === "image" || node.type.name === "video") {
        const src = String(node.attrs.src ?? "");
        const attachment = urlToAttachment.get(src);

        if (attachment) {
          const json = node.toJSON();
          json.attrs = { ...json.attrs, src: attachment.redirectUrl };
          transformedNodes.push(Node.fromJSON(schema, json));
        } else {
          transformedNodes.push(node);
        }
      } else if (node.content.size > 0) {
        // Recurse into container nodes so nested media is rewritten too
        transformedNodes.push(node.copy(transformFragment(node.content)));
      } else {
        transformedNodes.push(node);
      }
    });

    return Fragment.fromArray(transformedNodes);
  };

  return doc.copy(transformFragment(doc.content));
}
}
+8 -8
View File
@@ -1,12 +1,12 @@
import { ProsemirrorHelper as SharedProsemirrorHelper } from "@shared/utils/ProsemirrorHelper";
import { createContext } from "@server/context";
import { buildProseMirrorDoc, buildUser } from "@server/test/factories";
import { TextHelper } from "./TextHelper";
import { ProsemirrorHelper } from "./ProsemirrorHelper";
jest.mock("@server/storage/files");
describe("TextHelper", () => {
describe("replaceImagesWithAttachmentsInNode", () => {
describe("ProsemirrorHelper", () => {
describe("replaceImagesWithAttachments", () => {
it("should return the same document when there are no images", async () => {
const user = await buildUser();
const ctx = createContext({ user });
@@ -18,7 +18,7 @@ describe("TextHelper", () => {
},
]);
const result = await TextHelper.replaceImagesWithAttachmentsInNode(
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
@@ -68,7 +68,7 @@ describe("TextHelper", () => {
},
]);
const result = await TextHelper.replaceImagesWithAttachmentsInNode(
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
@@ -97,7 +97,7 @@ describe("TextHelper", () => {
},
]);
const result = await TextHelper.replaceImagesWithAttachmentsInNode(
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
@@ -135,7 +135,7 @@ describe("TextHelper", () => {
},
]);
const result = await TextHelper.replaceImagesWithAttachmentsInNode(
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
@@ -159,7 +159,7 @@ describe("TextHelper", () => {
},
]);
const result = await TextHelper.replaceImagesWithAttachmentsInNode(
const result = await ProsemirrorHelper.replaceImagesWithAttachments(
ctx,
doc,
user
+1 -109
View File
@@ -1,10 +1,8 @@
import chunk from "lodash/chunk";
import escapeRegExp from "lodash/escapeRegExp";
import { Fragment, Node } from "prosemirror-model";
import { AttachmentPreset } from "@shared/types";
import { ProsemirrorHelper as SharedProsemirrorHelper } from "@shared/utils/ProsemirrorHelper";
import { isInternalUrl } from "@shared/utils/urls";
import attachmentCreator from "@server/commands/attachmentCreator";
import { schema } from "@server/editor";
import env from "@server/env";
import Logger from "@server/logging/Logger";
import { trace } from "@server/logging/tracing";
@@ -14,7 +12,6 @@ import FileStorage from "@server/storage/files";
import type { APIContext } from "@server/types";
import parseAttachmentIds from "@server/utils/parseAttachmentIds";
import parseImages from "@server/utils/parseImages";
import { isInternalUrl } from "@shared/utils/urls";
@trace()
export class TextHelper {
@@ -131,109 +128,4 @@ export class TextHelper {
return output;
}
/**
 * Replaces remote and base64 encoded images in the given Prosemirror node
 * with attachment urls and uploads the images to the storage provider.
 * Video nodes are processed the same way.
 *
 * Each distinct source is downloaded at most once, even when several nodes
 * reference it. Sources that are not parseable URLs, internal URLs, and
 * sources whose download fails are left untouched.
 *
 * @param ctx The API context.
 * @param doc The Prosemirror node to process.
 * @param user The user context.
 * @returns A new Prosemirror node with images replaced, or the given node
 * itself when nothing was replaced.
 */
static async replaceImagesWithAttachmentsInNode(
  ctx: APIContext,
  doc: Node,
  user: User
): Promise<Node> {
  const images = SharedProsemirrorHelper.getImages(doc);
  const videos = SharedProsemirrorHelper.getVideos(doc);
  const nodes = [...images, ...videos];

  if (!nodes.length) {
    return doc;
  }

  const timeoutPerImage = Math.floor(
    Math.min(env.REQUEST_TIMEOUT / nodes.length, 10000)
  );

  // Resolve the set of distinct downloadable sources up front. Checking for
  // duplicates from within the parallel downloads does not work, because
  // nodes in the same chunk that share a src all pass the check before any
  // of them has stored a result.
  const sourceToNode: Map<string, Node> = new Map();
  for (const node of nodes) {
    const src = String(node.attrs.src ?? "");

    if (sourceToNode.has(src)) {
      continue;
    }

    // Skip invalid URLs
    try {
      new URL(src);
    } catch {
      continue;
    }

    // Skip internal URLs
    if (isInternalUrl(src)) {
      continue;
    }

    sourceToNode.set(src, node);
  }

  const urlToAttachment: Map<string, Attachment> = new Map();

  // Download at most ten sources concurrently
  for (const batch of chunk(Array.from(sourceToNode.entries()), 10)) {
    await Promise.all(
      batch.map(async ([src, node]) => {
        try {
          const attachment = await attachmentCreator({
            name: String(node.attrs.alt ?? node.type.name),
            url: src,
            preset: AttachmentPreset.DocumentAttachment,
            user,
            fetchOptions: {
              timeout: timeoutPerImage,
            },
            ctx,
          });

          if (attachment) {
            urlToAttachment.set(src, attachment);
          }
        } catch (err) {
          // Best effort: a failed download leaves the original src in place
          Logger.warn("Failed to download image for attachment", {
            error: err instanceof Error ? err.message : String(err),
            src,
          });
        }
      })
    );
  }

  // Nothing was uploaded, so there is nothing to rewrite
  if (!urlToAttachment.size) {
    return doc;
  }

  // Transform the document to replace image/video src attributes
  const transformFragment = (fragment: Fragment): Fragment => {
    const transformedNodes: Node[] = [];

    fragment.forEach((node) => {
      if (node.type.name === "image" || node.type.name === "video") {
        const src = String(node.attrs.src ?? "");
        const attachment = urlToAttachment.get(src);

        if (attachment) {
          const json = node.toJSON();
          json.attrs = { ...json.attrs, src: attachment.redirectUrl };
          transformedNodes.push(Node.fromJSON(schema, json));
        } else {
          transformedNodes.push(node);
        }
      } else if (node.content.size > 0) {
        // Recurse into container nodes so nested media is rewritten too
        transformedNodes.push(node.copy(transformFragment(node.content)));
      } else {
        transformedNodes.push(node);
      }
    });

    return Fragment.fromArray(transformedNodes);
  };

  return doc.copy(transformFragment(doc.content));
}
}
-9
View File
@@ -11,15 +11,6 @@ declare module "email-providers" {
export default list;
}
declare module "@joplin/turndown-plugin-gfm" {
import { Plugin } from "turndown";
export const strikethrough: Plugin;
export const tables: Plugin;
export const taskListItems: Plugin;
export const gfm: Plugin;
}
declare module "ukkonen" {
export default function ukkonen(
first: string,
+278 -18
View File
@@ -1,31 +1,291 @@
import { DocumentConverter } from "./DocumentConverter";
describe("csvToMarkdown", () => {
it("should convert csv to markdown with comma", async () => {
const csv = `name,age
describe("DocumentConverter", () => {
describe("convert", () => {
describe("csv", () => {
it("should convert csv to markdown table", async () => {
const csv = `name,age
John,25
Jane,24`;
const markdown = `| name | age |
| --- | --- |
| John | 25 |
| Jane | 24 |
`;
const result = await DocumentConverter.convert(
csv,
"test.csv",
"text/csv"
);
expect(await DocumentConverter.csvToMarkdown(csv)).toEqual(markdown);
});
// CSV is converted to a markdown table
expect(result.text).toContain("| name | age |");
expect(result.text).toContain("John");
expect(result.text).toContain("Jane");
expect(result.title).toEqual("");
});
it("should convert csv to markdown with semicolon", async () => {
const csv = `name;age
it("should handle csv with semicolon delimiter", async () => {
const csv = `name;age
John;25
"Joan ""the bone"", Anne";24`;
const markdown = `| name | age |
| --- | --- |
| John | 25 |
| Joan "the bone", Anne | 24 |
`;
const result = await DocumentConverter.convert(
csv,
"test.csv",
"text/csv"
);
expect(await DocumentConverter.csvToMarkdown(csv)).toEqual(markdown);
expect(result.text).toContain("| name | age |");
expect(result.text).toContain("John");
expect(result.text).toContain('Joan "the bone", Anne');
});
it("should handle csv with title row before headers", async () => {
// Some financial exports have a title row before the actual headers
const csv = `"Report for Account"
"Symbol","Name","Value",
"ABC","Test Corp","$100",
"XYZ","Other Inc","$200",`;
const result = await DocumentConverter.convert(
csv,
"test.csv",
"text/csv"
);
// The actual data headers should be used, not the title row
expect(result.text).toContain("| Symbol | Name | Value |");
expect(result.text).toContain("ABC");
expect(result.text).toContain("Test Corp");
expect(result.text).toContain("XYZ");
});
it("should handle csv with trailing comma on each line", async () => {
const csv = `name,age,city,
John,25,NYC,
Jane,24,LA,`;
const result = await DocumentConverter.convert(
csv,
"test.csv",
"text/csv"
);
expect(result.text).toContain("| name | age | city |");
expect(result.text).toContain("John");
expect(result.text).toContain("Jane");
// Should not have trailing empty column
expect(result.text).not.toContain("| city | |");
expect(result.text).not.toContain("| city | |");
});
it("should preserve intentionally empty cells at end of rows", async () => {
const csv = `name,age,city
John,25,NYC
Jane,24,`;
const result = await DocumentConverter.convert(
csv,
"test.csv",
"text/csv"
);
expect(result.text).toContain("| name | age | city |");
expect(result.text).toContain("John");
expect(result.text).toContain("NYC");
// Jane's row should have 3 columns (empty city preserved)
expect(result.text).toMatch(/\| Jane \| 24\s*\|\s*\|/);
});
});
describe("html", () => {
it("should extract title from H1", async () => {
const html = "<h1>My Title</h1><p>Content here</p>";
const result = await DocumentConverter.convert(
html,
"test.html",
"text/html"
);
expect(result.title).toEqual("My Title");
expect(result.text).toContain("Content here");
expect(result.text).not.toContain("My Title");
});
it("should extract emoji from start", async () => {
const html = "<p>🚀 Launch content</p>";
const result = await DocumentConverter.convert(
html,
"test.html",
"text/html"
);
expect(result.icon).toEqual("🚀");
expect(result.text).not.toMatch(/^🚀/);
});
});
describe("markdown", () => {
it("should extract title from H1", async () => {
const md = "# My Title\n\nContent here";
const result = await DocumentConverter.convert(
md,
"test.md",
"text/markdown"
);
expect(result.title).toEqual("My Title");
expect(result.text).toContain("Content here");
expect(result.text).not.toContain("My Title");
});
it("should return empty title when no H1", async () => {
const md = "## Subtitle\n\nContent here";
const result = await DocumentConverter.convert(
md,
"test.md",
"text/markdown"
);
expect(result.title).toEqual("");
expect(result.text).toContain("Subtitle");
});
});
});
// Exercises DocumentConverter.htmlToProsemirror directly: structural mapping
// of common HTML elements, stripping of non-content tags, and the image
// preprocessing rules (emoticons, Jira icons, Confluence sizing).
describe("htmlToProsemirror", () => {
it("should convert basic HTML to Prosemirror", () => {
const html = "<p>Hello world</p>";
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.type.name).toBe("doc");
expect(doc.content.childCount).toBe(1);
expect(doc.content.child(0).type.name).toBe("paragraph");
expect(doc.content.child(0).textContent).toBe("Hello world");
});
it("should convert HTML with heading", () => {
const html = "<h1>Title</h1><p>Content</p>";
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.content.childCount).toBe(2);
expect(doc.content.child(0).type.name).toBe("heading");
expect(doc.content.child(0).attrs.level).toBe(1);
expect(doc.content.child(0).textContent).toBe("Title");
expect(doc.content.child(1).type.name).toBe("paragraph");
});
// Script bodies must not leak into the document text.
it("should remove script tags", () => {
const html = "<p>Safe content</p><script>alert('xss')</script>";
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.textContent).toBe("Safe content");
expect(doc.textContent).not.toContain("alert");
});
// Stylesheet text must not leak into the document text.
it("should remove style tags", () => {
const html = "<style>body { color: red; }</style><p>Content</p>";
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.textContent).toBe("Content");
expect(doc.textContent).not.toContain("color");
});
// Buffers are accepted as well as strings.
it("should handle Buffer input", () => {
const html = Buffer.from("<p>From buffer</p>", "utf8");
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.content.child(0).textContent).toBe("From buffer");
});
it("should convert HTML with lists", () => {
const html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.content.childCount).toBe(1);
expect(doc.content.child(0).type.name).toBe("bullet_list");
expect(doc.content.child(0).content.childCount).toBe(2);
});
it("should convert HTML with bold and italic", () => {
const html = "<p><strong>Bold</strong> and <em>italic</em></p>";
const doc = DocumentConverter.htmlToProsemirror(html);
const paragraph = doc.content.child(0);
expect(paragraph.type.name).toBe("paragraph");
// Check that marks are applied
const boldText = paragraph.content.child(0);
expect(boldText.text).toBe("Bold");
expect(boldText.marks.some((m) => m.type.name === "strong")).toBe(true);
// Child 1 is the unmarked " and " text between the two marked runs.
const italicText = paragraph.content.child(2);
expect(italicText.text).toBe("italic");
expect(italicText.marks.some((m) => m.type.name === "em")).toBe(true);
});
// A complete document: only the <body> content should produce nodes, the
// <head> (title, meta) contributes nothing.
it("should handle full HTML document", () => {
const html = `
<!DOCTYPE html>
<html>
<head>
<title>Test</title>
<meta charset="utf-8">
</head>
<body>
<h1>Document Title</h1>
<p>Paragraph content</p>
</body>
</html>
`;
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.content.childCount).toBe(2);
expect(doc.content.child(0).type.name).toBe("heading");
expect(doc.content.child(0).textContent).toBe("Document Title");
expect(doc.content.child(1).type.name).toBe("paragraph");
expect(doc.content.child(1).textContent).toBe("Paragraph content");
});
// NOTE(review): these assertions only inspect textContent; they do not
// verify that no image node remains in the paragraph. Consider also
// asserting on the child node types.
it("should remove emoticon images", () => {
const html = `<p>Hello <img class="emoticon" src="smile.png" alt=":)"> world</p>`;
const doc = DocumentConverter.htmlToProsemirror(html);
// Emoticon image should be removed, text content remains
expect(doc.textContent).not.toContain(":)");
expect(doc.textContent).toContain("Hello");
expect(doc.textContent).toContain("world");
});
// An <img class="icon"> inside a .jira-issue-key element is dropped while
// the issue key text is kept.
it("should remove Jira icon images", () => {
const html = `
<p>Issue: <span class="jira-issue-key"><img class="icon" src="icon.png">ABC-123</span></p>
`;
const doc = DocumentConverter.htmlToProsemirror(html);
expect(doc.textContent).toBe("Issue: ABC-123");
});
// width=400 is half of data-width=800, so the expected height is
// data-height / 2 = 300.
it("should apply Confluence image sizing", () => {
const html = `
<p><img src="image.png" data-width="800" data-height="600" width="400"></p>
`;
const doc = DocumentConverter.htmlToProsemirror(html);
const paragraph = doc.content.child(0);
const image = paragraph.content.child(0);
expect(image.type.name).toBe("image");
expect(image.attrs.width).toBe(400);
expect(image.attrs.height).toBe(300);
});
});
});
+333 -117
View File
@@ -1,89 +1,79 @@
import { parse } from "@fast-csv/parse";
import { JSDOM } from "jsdom";
import escapeRegExp from "lodash/escapeRegExp";
import { simpleParser } from "mailparser";
import mammoth from "mammoth";
import type { Node } from "prosemirror-model";
import { DOMParser as ProsemirrorDOMParser } from "prosemirror-model";
import { ProsemirrorHelper as SharedProsemirrorHelper } from "@shared/utils/ProsemirrorHelper";
import { schema, serializer } from "@server/editor";
import { FileImportError } from "@server/errors";
import { trace, traceFunction } from "@server/logging/tracing";
import { ProsemirrorHelper } from "@server/models/helpers/ProsemirrorHelper";
import turndownService from "@server/utils/turndown";
/**
 * The structured result of converting an imported file.
 */
export interface ConvertResult {
/** The document content as markdown text. */
text: string;
/** The document content as a Prosemirror node. */
doc: Node;
/** The extracted title (from H1 heading if present). */
title: string;
/** The extracted emoji/icon from start of document. */
icon?: string;
}
@trace()
export class DocumentConverter {
/**
* Convert an incoming file to markdown.
* Convert an incoming file to a structured document result.
*
* @param content The content of the file.
* @param fileName The name of the file, including extension.
* @param mimeType The mime type of the file.
* @returns The markdown representation of the file.
* @returns The converted document with text, data, title, and icon.
*/
public static async convertToMarkdown(
public static async convert(
content: Buffer | string,
fileName: string,
mimeType: string
) {
return (
await this.internalConvertToMarkdown(content, fileName, mimeType)
).trim();
}
): Promise<ConvertResult> {
let doc: Node;
private static async internalConvertToMarkdown(
content: Buffer | string,
fileName: string,
mimeType: string
) {
// First try to convert the file based on the mime type.
switch (mimeType) {
case "application/msword":
return this.confluenceToMarkdown(content);
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
return this.docXToMarkdown(content);
case "text/html":
return this.htmlToMarkdown(content);
case "text/plain":
case "text/markdown":
return this.fileToMarkdown(content);
case "text/csv":
return this.csvToMarkdown(content);
default:
break;
// Route to appropriate conversion method
const html = await this.convertToHtml(content, fileName, mimeType);
if (html !== undefined) {
doc = this.htmlToProsemirror(html);
} else {
const markdown = await this.convertToMarkdown(
content,
fileName,
mimeType
);
doc = ProsemirrorHelper.toProsemirror(markdown);
}
// If the mime type doesn't work, try to convert based on the file extension.
const extension = fileName.split(".").pop();
switch (extension) {
case "docx":
return this.docXToMarkdown(content);
case "html":
return this.htmlToMarkdown(content);
case "md":
case "markdown":
return this.fileToMarkdown(content);
default:
throw FileImportError(`File type ${mimeType} not supported`);
}
}
public static async docXToMarkdown(content: Buffer | string) {
if (content instanceof Buffer) {
const { value } = await traceFunction({ spanName: "convertToHtml" })(
mammoth.convertToHtml
)({
buffer: content,
});
return turndownService.turndown(value);
// Extract title from first H1 heading
let title = "";
const headings = SharedProsemirrorHelper.getHeadings(doc);
if (headings.length > 0 && headings[0].level === 1) {
title = headings[0].title;
doc = ProsemirrorHelper.removeFirstHeading(doc);
}
throw FileImportError("Unsupported Word file");
}
// Extract emoji from start of document
const { emoji: icon, doc: docWithoutEmoji } =
ProsemirrorHelper.extractEmojiFromStart(doc);
doc = docWithoutEmoji;
public static async htmlToMarkdown(content: Buffer | string) {
if (typeof content !== "string") {
content = content.toString("utf8");
}
// Serialize to markdown and trim whitespace
const text = serializer.serialize(doc).trim();
return turndownService.turndown(content);
return {
text,
doc,
title,
icon,
};
}
/**
@@ -93,19 +83,231 @@ export class DocumentConverter {
* @returns A Prosemirror Node representing the document.
*/
public static htmlToProsemirror(content: Buffer | string): Node {
  // A stale delegating `return ProsemirrorHelper.htmlToProsemirror(content)`
  // used to sit here and made everything below unreachable.
  const html = typeof content === "string" ? content : content.toString("utf8");

  const dom = new JSDOM(html);
  const document = dom.window.document;

  // Remove elements that carry no document content before parsing.
  document
    .querySelectorAll("script, style, title, head, meta, link")
    .forEach((el) => el.remove());

  // Clean up images and other edge cases before handing over to Prosemirror.
  this.preprocessHtmlForImport(document);

  // Prosemirror's DOMParser relies on browser globals; patch them in for the
  // duration of the parse and always restore them afterwards.
  const cleanup = ProsemirrorHelper.patchGlobalEnv(dom.window);

  try {
    return ProsemirrorDOMParser.fromSchema(schema).parse(document.body);
  } finally {
    cleanup();
  }
}
public static csvToMarkdown(content: Buffer | string): Promise<string> {
/**
 * Preprocesses the HTML DOM before Prosemirror parsing: drops emoticon
 * images and Jira issue-key icons, and derives a `height` attribute for
 * images that carry Confluence `data-width`/`data-height` sizing.
 *
 * @param document The DOM document to preprocess (mutated in place).
 */
private static preprocessHtmlForImport(document: Document): void {
  document.querySelectorAll("img").forEach((img) => {
    const className = img.className || "";

    // Emoticon images are dropped entirely.
    if (className.includes("emoticon")) {
      img.remove();
      return;
    }

    // Remove Jira icon images nested inside an issue key.
    if (
      className === "icon" &&
      img.parentElement?.className.includes("jira-issue-key")
    ) {
      img.remove();
      return;
    }

    // Confluence image sizing: scale the natural height by the same ratio
    // that the displayed width has to the natural width.
    const dataWidth = img.getAttribute("data-width");
    const dataHeight = img.getAttribute("data-height");
    const width = img.getAttribute("width");

    if (dataWidth && dataHeight && width) {
      const ratio = parseInt(dataWidth, 10) / parseInt(width, 10);
      const calculatedHeight = Math.round(parseInt(dataHeight, 10) / ratio);

      // Zero or non-numeric dimensions yield NaN/Infinity/0; leave the image
      // untouched rather than writing a meaningless height attribute.
      if (Number.isFinite(calculatedHeight) && calculatedHeight > 0) {
        img.setAttribute("height", String(calculatedHeight));
      }
    }
  });
}
/**
 * Attempts to convert content to HTML for formats that support it.
 * Returns undefined for formats that should be parsed as markdown directly.
 *
 * The mime type is consulted first; if it is not recognised the file
 * extension is used as a fallback and compared case-insensitively.
 *
 * @param content The content of the file.
 * @param fileName The name of the file, including extension.
 * @param mimeType The mime type of the file.
 * @returns HTML string if convertible, undefined otherwise.
 */
private static async convertToHtml(
  content: Buffer | string,
  fileName: string,
  mimeType: string
): Promise<string | undefined> {
  // First try to convert based on the mime type
  switch (mimeType) {
    case "text/html":
      return this.bufferToString(content);
    case "application/msword":
      return this.confluenceToHtml(content);
    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
      return this.docxToHtml(content);
    default:
      break;
  }

  // Fall back to the file extension; lower-cased so that e.g. "REPORT.DOCX"
  // or "index.HTML" are recognised as well.
  const extension = fileName.split(".").pop()?.toLowerCase();
  switch (extension) {
    case "html":
      return this.bufferToString(content);
    case "docx":
      return this.docxToHtml(content);
    default:
      return undefined;
  }
}
/**
 * Converts content to markdown for text-based formats.
 *
 * The mime type is consulted first; if it is not recognised the file
 * extension is used as a fallback and compared case-insensitively.
 *
 * @param content The content of the file.
 * @param fileName The name of the file, including extension.
 * @param mimeType The mime type of the file.
 * @returns Markdown string.
 * @throws FileImportError if neither the mime type nor the extension is supported.
 */
private static async convertToMarkdown(
  content: Buffer | string,
  fileName: string,
  mimeType: string
): Promise<string> {
  switch (mimeType) {
    case "text/plain":
    case "text/markdown":
      return this.bufferToString(content);
    case "text/csv":
      return this.csvToMarkdown(content);
    default:
      break;
  }

  // Lower-cased so that e.g. "README.MD" is recognised as markdown.
  const extension = fileName.split(".").pop()?.toLowerCase();
  switch (extension) {
    case "md":
    case "markdown":
      return this.bufferToString(content);
    default:
      throw FileImportError(`File type ${mimeType} not supported`);
  }
}
/**
 * Produce HTML from a docx file by running it through mammoth.
 *
 * @param content The docx file content; only Buffer input is supported.
 * @returns The HTML that mammoth generated for the document.
 * @throws FileImportError when the content is not a Buffer.
 */
private static async docxToHtml(content: Buffer | string): Promise<string> {
  if (!(content instanceof Buffer)) {
    throw FileImportError("Unsupported Word file");
  }

  const tracedConvert = traceFunction({ spanName: "convertToHtml" })(
    mammoth.convertToHtml
  );
  const result = await tracedConvert({ buffer: content });
  return result.value;
}
/**
 * Convert a Confluence Word export to HTML.
 *
 * Attachments referenced from the HTML are inlined as base64 data URIs so
 * that the result is self-contained.
 *
 * @param content The Confluence Word export content.
 * @returns The HTML representation of the document.
 * @throws FileImportError if the content is not a Confluence multipart export
 * or contains no HTML part.
 */
private static async confluenceToHtml(
  content: Buffer | string
): Promise<string> {
  const source =
    typeof content === "string" ? content : content.toString("utf8");

  // We're only supporting the output from Confluence here, regular Word documents should call
  // into the docxToHtml importer. See: https://jira.atlassian.com/browse/CONFSERVER-38237
  if (!source.includes("Content-Type: multipart/related")) {
    throw FileImportError("Unsupported Word file");
  }

  // Confluence "Word" documents are actually just multi-part email messages, so we can use
  // mailparser to parse the content.
  const parsed = await simpleParser(source);
  if (!parsed.html) {
    throw FileImportError("Unsupported Word file (No content found)");
  }

  let html = parsed.html;

  // Replace the content-location with a data URI for each attachment.
  for (const attachment of parsed.attachments) {
    const contentLocation = String(
      attachment.headers.get("content-location") ?? ""
    );
    const id = contentLocation.split("/").pop();
    if (!id) {
      continue;
    }

    // Use the attachment's own image type when it declares one (JPEG, GIF,
    // SVG, …) instead of labelling everything as PNG; fall back to PNG for
    // anything that does not identify itself as an image.
    const mimeType = attachment.contentType?.startsWith("image/")
      ? attachment.contentType
      : "image/png";
    const dataUri = `data:${mimeType};base64,${attachment.content.toString(
      "base64"
    )}`;

    // A replacer function keeps "$" sequences in the URI from being
    // interpreted as replacement patterns.
    html = html.replace(new RegExp(escapeRegExp(id), "g"), () => dataUri);
  }

  return html;
}
/**
* Convert a CSV file to a markdown table.
*
* @param content The CSV file content.
* @returns A markdown table representation.
*/
private static csvToMarkdown(content: Buffer | string): Promise<string> {
return new Promise((resolve, reject) => {
const text = this.fileToMarkdown(content).trim();
const firstLine = text.split("\n")[0];
const text = this.bufferToString(content).trim();
const textLines = text.split("\n");
// Find the first non-empty line to determine the delimiter
const firstNonEmptyLine =
textLines.find((line) => line.trim().length > 0) || "";
// Determine the separator used in the CSV file based on number of occurrences of each separator on first line
const delimiter = [";", ",", "\t"].reduce(
(acc, separator) => {
const count = (
firstLine.match(new RegExp(escapeRegExp(separator), "g")) || []
firstNonEmptyLine.match(new RegExp(escapeRegExp(separator), "g")) ||
[]
).length;
return count > acc.count ? { count, separator } : acc;
},
@@ -121,9 +323,64 @@ export class DocumentConverter {
})
.on("data", (row) => lines.push(row))
.on("end", () => {
const headers = lines[0];
const table = lines
.slice(1)
// Filter out completely empty rows
const nonEmptyLines = lines.filter((row) =>
row.some((cell) => cell.trim() !== "")
);
if (nonEmptyLines.length === 0) {
resolve("");
return;
}
// Check if all rows have a trailing empty cell (trailing comma artifact)
// Only trim if ALL non-empty rows end with an empty cell
let trimmedLines = nonEmptyLines;
while (
trimmedLines.length > 0 &&
trimmedLines.every(
(row) => row.length > 0 && row[row.length - 1].trim() === ""
)
) {
trimmedLines = trimmedLines.map((row) => row.slice(0, -1));
}
// Find the most common column count
const columnCounts = new Map<number, number>();
for (const row of trimmedLines) {
if (row.length > 0) {
columnCounts.set(
row.length,
(columnCounts.get(row.length) || 0) + 1
);
}
}
// Get the column count that appears most frequently
let expectedColumns = 0;
let maxFrequency = 0;
for (const [count, frequency] of columnCounts) {
if (frequency > maxFrequency) {
maxFrequency = frequency;
expectedColumns = count;
}
}
// Find the first row with the expected column count (this is the header)
const headerIndex = trimmedLines.findIndex(
(row) => row.length === expectedColumns
);
if (headerIndex === -1) {
resolve("");
return;
}
const headers = trimmedLines[headerIndex];
const dataRows = trimmedLines
.slice(headerIndex + 1)
.filter((row) => row.length === expectedColumns);
const table = dataRows
.map((cells) => `| ${cells.join(" | ")} |`)
.join("\n");
@@ -138,54 +395,13 @@ export class DocumentConverter {
});
}
public static fileToMarkdown(content: Buffer | string) {
  // Strings pass through untouched; buffers are decoded as UTF-8.
  return typeof content === "string" ? content : content.toString("utf8");
}
public static async confluenceToMarkdown(content: Buffer | string) {
if (typeof content !== "string") {
content = content.toString("utf8");
}
// We're only supporting the output from Confluence here, regular Word documents should call
// into the docxToMarkdown importer. See: https://jira.atlassian.com/browse/CONFSERVER-38237
if (!content.includes("Content-Type: multipart/related")) {
throw FileImportError("Unsupported Word file");
}
// Confluence "Word" documents are actually just multi-part email messages, so we can use
// mailparser to parse the content.
const parsed = await simpleParser(content);
if (!parsed.html) {
throw FileImportError("Unsupported Word file (No content found)");
}
// Replace the content-location with a data URI for each attachment.
for (const attachment of parsed.attachments) {
const contentLocation = String(
attachment.headers.get("content-location") ?? ""
);
const id = contentLocation.split("/").pop();
if (!id) {
continue;
}
parsed.html = parsed.html.replace(
new RegExp(escapeRegExp(id), "g"),
`data:image/png;base64,${attachment.content.toString("base64")}`
);
}
// If we don't remove the title here it becomes printed in the document
// body by turndown
turndownService.remove(["style", "title"]);
// Now we should have something that looks like HTML
const html = turndownService.turndown(parsed.html);
return html.replace(/<br>/g, " \\n ");
/**
 * Normalize file content to a string.
 *
 * @param content The content, either already a string or a Buffer.
 * @returns The string itself, or the Buffer decoded as UTF-8.
 */
private static bufferToString(content: Buffer | string): string {
  if (typeof content === "string") {
    return content;
  }
  return content.toString("utf8");
}
}
-15
View File
@@ -1,15 +0,0 @@
import type TurndownService from "turndown";
/**
 * Turndown plugin that renders every `<br>` element as the two-character
 * sequence backslash + "n" in the markdown output.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function breaks(turndownService: TurndownService) {
  turndownService.addRule("breaks", {
    filter: ["br"],
    replacement: () => "\\n",
  });
}
-22
View File
@@ -1,22 +0,0 @@
import type TurndownService from "turndown";
/**
 * Turndown plugin that unwraps list items whose only child is another list,
 * emitting the nested list's content directly.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function emptyLists(turndownService: TurndownService) {
  turndownService.addRule("empty-lists", {
    filter: (node) => {
      // Only <li> elements with exactly one child node qualify.
      if (node.nodeName !== "LI" || node.childNodes.length !== 1) {
        return false;
      }
      const onlyChild = node.firstChild?.nodeName;
      return onlyChild === "OL" || onlyChild === "UL";
    },
    // Pass the already-converted nested list through unchanged.
    replacement: (content) => content,
  });
}
-22
View File
@@ -1,22 +0,0 @@
import type TurndownService from "turndown";
/**
 * Turndown plugin that turns a paragraph containing nothing but a single
 * `<br>` into an explicit blank line in the markdown output.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function emptyParagraphs(turndownService: TurndownService) {
  turndownService.addRule("emptyParagraphs", {
    filter: (node) => {
      // Must be a <p> with exactly one element child…
      if (node.nodeName !== "P" || node.children.length !== 1) {
        return false;
      }
      // …that is a <br>, with no visible text around it.
      return (
        node.textContent?.trim() === "" && node.children[0].nodeName === "BR"
      );
    },
    replacement: () => "\n\n\\\n",
  });
}
-21
View File
@@ -1,21 +0,0 @@
import type TurndownService from "turndown";
/**
 * Turndown plugin that replaces iframes with a markdown link to the frame's
 * source, labelled with the frame's title when it has one.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function images(turndownService: TurndownService) {
  turndownService.addRule("frames", {
    filter: "iframe",
    replacement(content, node: HTMLIFrameElement) {
      // Strip newlines from the URL; a frame without a source produces nothing.
      const src = (node.getAttribute("src") || "").replace(/\n+/g, "");
      if (!src) {
        return "";
      }

      const title = cleanAttribute(node.getAttribute("title") || "");
      return `[${title || src}](${src})`;
    },
  });
}
// Collapses every run of newlines (and the whitespace following them) into a
// single newline; falsy input yields an empty string.
function cleanAttribute(attribute: string) {
  if (!attribute) {
    return "";
  }
  return attribute.replace(/(\n+\s*)+/g, "\n");
}
-50
View File
@@ -1,50 +0,0 @@
import type TurndownService from "turndown";
/**
 * A turndown plugin overriding inbuilt image parsing behavior.
 *
 * Emoticon images are not matched by this rule, Jira issue-key icons are
 * dropped, and Confluence `data-width`/`data-height` sizing is carried over
 * into the markdown title as ` =WIDTHxHEIGHT`.
 *
 * @param turndownService The TurndownService instance.
 */
export default function images(turndownService: TurndownService) {
  turndownService.addRule("image", {
    filter(node) {
      return node.nodeName === "IMG" && !node?.className.includes("emoticon");
    },
    replacement(content, node) {
      if (!("className" in node)) {
        return content;
      }
      const alt = cleanAttribute(node.getAttribute("alt") || "");
      const src = cleanAttribute(node.getAttribute("src") || "");
      const title = cleanAttribute(node.getAttribute("title") || "");

      // Remove icons in issue keys as they will not resolve correctly and mess
      // up the layout.
      if (
        node.className === "icon" &&
        node.parentElement?.className.includes("jira-issue-key")
      ) {
        return "";
      }

      // Respect embedded Confluence image size. `size` defaults to an empty
      // string so a titled image without sizing does not render the literal
      // text "undefined" after its title.
      let size = "";
      const naturalWidth = node.getAttribute("data-width");
      const naturalHeight = node.getAttribute("data-height");
      const width = node.getAttribute("width");

      if (naturalWidth && naturalHeight && width) {
        const ratio = parseInt(naturalWidth, 10) / parseInt(width, 10);
        const height = Math.round(parseInt(naturalHeight, 10) / ratio);

        // Skip sizing when the attributes are zero or not numeric.
        if (Number.isFinite(height) && height > 0) {
          size = ` =${width}x${height}`;
        }
      }

      const titlePart = title || size ? ` "${title}${size}"` : "";

      return src ? `![${alt}](${src}${titlePart})` : "";
    },
  });
}

// Removes all newlines from an attribute value and trims surrounding
// whitespace; falsy input yields an empty string.
function cleanAttribute(attribute: string) {
  return (attribute ? attribute.replace(/\n+/g, "") : "").trim();
}
-47
View File
@@ -1,47 +0,0 @@
import { taskListItems, strikethrough } from "@joplin/turndown-plugin-gfm";
import TurndownService from "turndown";
import { escape } from "@shared/utils/markdown";
import breaks from "./breaks";
import emptyLists from "./emptyLists";
import emptyParagraph from "./emptyParagraph";
import frames from "./frames";
import images from "./images";
import inlineLink from "./inlineLink";
import sanitizeLists from "./sanitizeLists";
import sanitizeTables from "./sanitizeTables";
import tables from "./tables";
import underlines from "./underlines";
import { inHtmlContext } from "./utils";
/**
 * Turndown converts HTML to Markdown and is used in the importer code.
 *
 * For options, see: https://github.com/domchristie/turndown#options
 */
const service = new TurndownService({
hr: "---",
bulletListMarker: "-",
headingStyle: "atx",
codeBlockStyle: "fenced",
// Blank paragraphs outside of table cells are rendered as a backslash line
// so the empty line is preserved; every other blank node produces nothing.
blankReplacement: (_, node) =>
node.nodeName === "P" && !inHtmlContext(node as HTMLElement, "td, th")
? "\n\n\\\n"
: "",
})
// Elements whose content is dropped entirely from the output.
.remove(["script", "style", "title", "head"])
// Plugins are registered in this order.
.use(taskListItems)
.use(strikethrough)
.use(tables)
.use(inlineLink)
.use(emptyParagraph)
.use(sanitizeTables)
.use(sanitizeLists)
.use(underlines)
.use(frames)
.use(images)
.use(breaks)
.use(emptyLists);
// Replace the service's escape function with the shared markdown escape.
service.escape = escape;
export default service;
-22
View File
@@ -1,22 +0,0 @@
import type TurndownService from "turndown";
/**
 * Turndown plugin that renders anchors as inline markdown links without a
 * title part. Only applies when the link style option is "inlined" and the
 * anchor has an href.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function underlines(turndownService: TurndownService) {
  turndownService.addRule("inlineLink", {
    filter: (node, options) =>
      Boolean(
        options.linkStyle === "inlined" &&
          node.nodeName === "A" &&
          node.getAttribute("href")
      ),
    replacement: (content, node: HTMLElement) =>
      `[${content}](${node.getAttribute("href")})`,
  });
}
-64
View File
@@ -1,64 +0,0 @@
import type TurndownService from "turndown";
import { inHtmlContext } from "./utils";
/**
 * A turndown plugin for removing incompatible nodes from lists.
 *
 * Registers three rules: a list item rule that uses a single space after the
 * marker, a rule that renders headings inside list items as bold text, and a
 * rule that unwraps bold text inside headings.
 *
 * @param turndownService The TurndownService instance.
 */
export default function sanitizeLists(turndownService: TurndownService) {
  // Fork of default functionality to only use a single space between marker and content
  // See: https://github.com/mixmark-io/turndown/blob/cc73387fb707e5fb5e1083e94078d08f38f3abc8/src/commonmark-rules.js#L61
  turndownService.addRule("listItem", {
    filter: "li",
    replacement(content, node, options) {
      content = content
        .replace(/^\n+/, "") // remove leading newlines
        .replace(/\n+$/, "\n") // replace trailing newlines with just a single one
        // Continuation lines need a 4 space indent to stay nested under the
        // item for both "- " and "1. " markers; a single space does not nest.
        .replace(/\n/gm, "\n    ");

      let prefix = options.bulletListMarker + " ";
      const parent = node.parentNode;
      if (parent && parent.nodeName === "OL") {
        // Ordered items are numbered from the list's `start` attribute (or 1).
        const start = (parent as HTMLElement).getAttribute("start");
        const index = Array.prototype.indexOf.call(parent.children, node);
        prefix = (start ? Number(start) + index : index + 1) + ". ";
      }

      // Terminate the item with a newline when another sibling follows.
      return (
        prefix + content + (node.nextSibling && !/\n$/.test(content) ? "\n" : "")
      );
    },
  });

  turndownService.addRule("headingsInLists", {
    filter(node) {
      return (
        ["H1", "H2", "H3", "H4", "H5", "H6"].includes(node.nodeName) &&
        inHtmlContext(node, "LI")
      );
    },
    replacement(content, node, options) {
      if (!content.trim()) {
        return "";
      }
      return options.strongDelimiter + content + options.strongDelimiter;
    },
  });

  turndownService.addRule("strongInHeadings", {
    filter(node) {
      return (
        (node.nodeName === "STRONG" || node.nodeName === "B") &&
        ["H1", "H2", "H3", "H4", "H5", "H6"].some((tag) =>
          inHtmlContext(node, tag)
        )
      );
    },
    replacement(content) {
      return content;
    },
  });
}
-30
View File
@@ -1,30 +0,0 @@
import type TurndownService from "turndown";
import { inHtmlContext } from "./utils";
/**
 * Turndown plugin that rewrites nodes which markdown tables cannot hold:
 * headings inside a table become bold text, and paragraphs inside a table
 * are flattened, separated by a literal backslash + "n".
 *
 * @param turndownService The TurndownService instance to register the rules on.
 */
export default function sanitizeTables(turndownService: TurndownService) {
  const headingTags = ["H1", "H2", "H3", "H4", "H5", "H6"];

  turndownService.addRule("headingsInTables", {
    filter: (node) =>
      headingTags.includes(node.nodeName) && inHtmlContext(node, "table"),
    replacement: (content) => `**${content.trim()}**`,
  });

  turndownService.addRule("paragraphsInCells", {
    filter: (node) => node.nodeName === "P" && inHtmlContext(node, "table"),
    replacement: (content, node) => {
      // Only add a separator when more content follows in the same cell.
      const separator = node.nextSibling ? "\\n" : "";
      return content.trim() + separator;
    },
  });
}
-325
View File
@@ -1,325 +0,0 @@
// Based on https://www.npmjs.com/package/joplin-turndown-plugin-gfm
import type TurndownService from "turndown";
import { inHtmlContext } from "./utils";
// Turndown rules defined in this module, keyed by rule name.
const rules: Record<string, TurndownService.Rule> = {};
// Markdown divider cell for each supported column alignment.
const alignMap = { left: ":---", right: "---:", center: ":---:" };
// Note use of WeakMap to enable garbage collection
const tableShouldBeSkippedCache = new WeakMap<HTMLTableElement, boolean>();
// Reads a cell's alignment from its `align` attribute, falling back to its
// inline `text-align` style. The value is lower-cased; a missing node or
// missing alignment yields an empty string.
function getAlignment(node: HTMLElement) {
  if (!node) {
    return "";
  }
  const declared = node.getAttribute("align") || node.style.textAlign || "";
  return declared.toLowerCase() as "left" | "right" | "center";
}
// Maps a column alignment to its markdown divider cell; no alignment gives a
// plain "---" divider.
function getBorder(alignment: keyof typeof alignMap) {
  if (!alignment) {
    return "---";
  }
  return alignMap[alignment];
}
// Determines the alignment of a table column by letting every row's cell at
// that index vote. Starts from "left" and switches whenever a known
// alignment collects strictly more votes than the current choice.
function getColumnAlignment(
  table: HTMLTableElement | null,
  columnIndex: number
) {
  let winner: keyof typeof alignMap = "left";
  if (!table) {
    return winner;
  }

  const tally = {
    left: 0,
    right: 0,
    center: 0,
    "": 0,
  };

  // Reference is important as .rows is an expensive getter.
  const rows = table.rows;
  for (const row of Array.from(rows)) {
    // Rows that are too short have no cell in this column and do not vote.
    if (columnIndex >= row.childNodes.length) {
      continue;
    }

    const vote = getAlignment(row.childNodes[columnIndex] as HTMLElement);
    ++tally[vote];

    const isKnownAlignment = Object.keys(alignMap).includes(vote);
    if (tally[vote] > tally[winner] && isKnownAlignment) {
      winner = vote as keyof typeof alignMap;
    }
  }

  return winner;
}
// Table cells: rendered as a markdown cell unless the enclosing table is
// being skipped, in which case the content passes through unchanged.
rules.tableCell = {
  filter: ["th", "td"],
  replacement: (content, node: HTMLTableCellElement) =>
    tableShouldBeSkipped(nodeParentTable(node)) ? content : cell(content, node),
};
// Table rows: each row starts on a new line; a heading row is followed by a
// divider line with one aligned border cell per table column.
rules.tableRow = {
  filter: "tr",
  replacement(content, node: HTMLTableRowElement) {
    const table = nodeParentTable(node);
    if (tableShouldBeSkipped(table)) {
      return content;
    }
    if (!isHeadingRow(node)) {
      return "\n" + content;
    }

    // Build the divider for every column of the table, not just this row's
    // cells; columns without a cell in this row pass null to cell().
    const divider = Array.from({ length: tableColCount(table) }, (_, column) =>
      cell(
        getBorder(getColumnAlignment(table, column)),
        column < node.childNodes.length ? node.childNodes[column] : null,
        column
      )
    ).join("");

    return divider ? `\n${content}\n${divider}` : `\n${content}`;
  },
};
// Whole tables: normalizes blank lines and, when the table has no heading
// row, prepends an empty header plus divider so the markdown table is valid.
rules.table = {
// Only convert tables that can result in valid Markdown
// Other tables are kept as HTML using `keep` (see below).
filter(node) {
return node.nodeName === "TABLE" && !tableShouldBeHtml(node);
},
replacement(content, node: HTMLTableElement) {
// Skipped tables are emitted as their already-converted content.
if (tableShouldBeSkipped(node)) {
return content;
}
// Ensure there are no blank lines
content = content.replace(/\n+/g, "\n");
// If table has no heading, add an empty one so as to get a valid Markdown table
const secondLineParts = content.trim().split("\n");
let secondLine = "";
if (secondLineParts.length >= 2) {
secondLine = secondLineParts[1];
}
// A heading row was emitted if the second line is a divider such as
// "| --- |" or "| :--- |".
const secondLineIsDivider = /\| :?---/.test(secondLine);
const columnCount = tableColCount(node);
let emptyHeader = "";
if (columnCount && !secondLineIsDivider) {
// First an empty header row, then the start of the divider row which the
// loop below completes with one border cell per column.
emptyHeader = "|" + " |".repeat(columnCount) + "\n" + "|";
for (let columnIndex = 0; columnIndex < columnCount; ++columnIndex) {
emptyHeader +=
" " + getBorder(getColumnAlignment(node, columnIndex)) + " |";
}
}
// Surround the table with blank lines to separate it from other blocks.
return "\n\n" + emptyHeader + content + "\n\n";
},
};
// Table sections add nothing of their own; their rows pass through as-is.
rules.tableSection = {
  filter: ["thead", "tbody", "tfoot"],
  replacement: (content) => content,
};
/**
 * Decides whether a table row is a heading row. A row counts as a heading
 * row when its parent is a THEAD, or when every one of its child nodes is a
 * TH. A row without a parent is never a heading row.
 *
 * @param tr The tr node to check
 * @returns Whether the tr is a heading row
 */
function isHeadingRow(tr: Node) {
  const parent = tr.parentNode;
  if (!parent) {
    return false;
  }
  if (parent.nodeName === "THEAD") {
    return true;
  }
  return Array.from(tr.childNodes).every((child) => child.nodeName === "TH");
}
// Renders one markdown table cell. The first cell of a row gets the leading
// "| ", every cell gets a trailing " |". Line breaks inside the content
// become "<br>", pipes are escaped, and the content is padded to at least
// three characters. When `index` is omitted it is derived from the node's
// position among its parent's child nodes.
function cell(
  content: string,
  node: ChildNode | null = null,
  index: number | null = null
) {
  if (index === null && node) {
    index = Array.from(node?.parentNode?.childNodes ?? []).indexOf(node);
  }

  const prefix = index === 0 ? "| " : " ";

  let filteredContent = content
    .trim()
    // Treat CRLF (and the reversed LFCR) as a single line break so that no
    // stray carriage return is left inside the cell.
    .replace(/\r\n|\n\r|\n/g, "<br>")
    // A run of pipes collapses into one escaped pipe.
    .replace(/\|+/g, "\\|")
    .padEnd(3, " ");

  if (node) {
    filteredContent = handleColSpan(filteredContent, node, " ");
  }

  return prefix + filteredContent + " |";
}
// Reports whether any descendant of the node is a TABLE element. The node
// itself is not considered.
function nodeContainsTable(node: Node): boolean {
  if (!node?.childNodes) {
    return false;
  }
  return Array.from(node.childNodes).some(
    (child) => child.nodeName === "TABLE" || nodeContainsTable(child)
  );
}
// Reports whether any descendant of the node matches `types`. A descendant
// matches when `types` is exactly the string "code" and the descendant sits
// in a CODE context, or when `types` includes the descendant's nodeName.
// NOTE(review): the nodeName comparison is case-sensitive, so lower-case
// entries will not match the upper-case nodeName of HTML elements — confirm
// this is intended for the callers in this file.
const nodeContains = (
  node: HTMLElement,
  types: string | string[]
): boolean => {
  if (!node?.childNodes) {
    return false;
  }

  return Array.from(node.childNodes).some((entry) => {
    const child = entry as HTMLElement;
    return (
      (types === "code" && inHtmlContext(child, "CODE")) ||
      types.includes(child.nodeName) ||
      nodeContains(child, types)
    );
  });
};
// Whether a table should be kept as raw HTML instead of being converted,
// decided by looking for "code" / "table" descendants.
// NOTE(review): these tag names are lower-case while nodeContains compares
// against nodeName case-sensitively — verify that this can ever match.
const tableShouldBeHtml = (tableNode: HTMLElement) => {
  const htmlOnlyTags = ["code", "table"];
  return nodeContains(tableNode, htmlOnlyTags);
};
// Decides whether a table is skipped, i.e. its cells are rendered one after
// the other as if they were paragraphs. That is the case when there is no
// table, it exposes no rows, it consists of a single row with at most one
// child node, or it contains a nested table. Results are cached per table.
function tableShouldBeSkipped(tableNode: HTMLTableElement | null) {
  if (!tableNode) {
    return true;
  }

  const known = tableShouldBeSkippedCache.get(tableNode);
  if (known !== undefined) {
    return known;
  }

  // Reference is important as .rows is an expensive getter.
  const rows = tableNode.rows;
  const skip =
    !rows ||
    (rows.length === 1 && rows[0].childNodes.length <= 1) ||
    nodeContainsTable(tableNode);

  tableShouldBeSkippedCache.set(tableNode, skip);
  return skip;
}
// Walks up from a cell or row to the nearest enclosing TABLE element.
// Returns null when the chain of parents ends without finding one.
function nodeParentTable(
  node: HTMLTableCellElement | HTMLTableRowElement
): HTMLTableElement | null {
  for (let ancestor = node.parentNode; ancestor; ancestor = ancestor.parentNode) {
    if (ancestor.nodeName === "TABLE") {
      return ancestor as HTMLTableElement;
    }
  }
  return null;
}
// Appends one filler cell (" | " followed by three `emptyChar`s) for every
// column beyond the first that the node spans according to its `colspan`
// attribute. Without a node the content is returned unchanged.
function handleColSpan(content: string, node: ChildNode, emptyChar: string) {
  if (!node) {
    return content;
  }

  const span = Number((node as HTMLElement).getAttribute("colspan") || 1);
  const filler = " | " + emptyChar.repeat(3);

  let padded = content;
  let column = 1;
  while (column < span) {
    padded += filler;
    column++;
  }
  return padded;
}
// Returns the largest number of child nodes found in any row of the table,
// or 0 when there is no table.
function tableColCount(node: HTMLTableElement | null) {
  if (!node) {
    return 0;
  }

  // Reference is important as .rows is an expensive getter.
  const rows = node.rows;
  return Array.from(rows).reduce(
    (widest, row) => Math.max(widest, row.childNodes.length),
    0
  );
}
// Turndown plugin entry point: keeps tables that must stay HTML untouched and
// registers every table rule defined in this module.
export default function tables(turndownService: TurndownService) {
  turndownService.keep(
    (node) => node.nodeName === "TABLE" && tableShouldBeHtml(node)
  );

  Object.entries(rules).forEach(([name, rule]) => {
    turndownService.addRule(name, rule);
  });
}
-15
View File
@@ -1,15 +0,0 @@
import type TurndownService from "turndown";
/**
 * A turndown plugin that converts `u` tags into text wrapped in double
 * underscores.
 *
 * @param turndownService The TurndownService instance to register the rule on.
 */
export default function underlines(turndownService: TurndownService) {
  // Surrounding whitespace is trimmed before the content is wrapped.
  const wrapInUnderscores = (content: string) => `__${content.trim()}__`;

  turndownService.addRule("underlines", {
    filter: ["u"],
    replacement: wrapInUnderscores,
  });
}
-13
View File
@@ -1,13 +0,0 @@
/**
 * Reports whether a node sits inside (or is) an element matching a selector.
 *
 * @param node The node to start from; non-element nodes are resolved to their
 * nearest element ancestor first.
 * @param selector The CSS selector handed to `closest`.
 * @returns true when the resolved element or one of its ancestors matches.
 */
export function inHtmlContext(node: HTMLElement, selector: string) {
  const elementNodeType = 1;
  let cursor = node;

  // Climb until an element node is reached or the parent chain runs out.
  while (cursor !== null && cursor.nodeType !== elementNodeType) {
    cursor = (cursor.parentElement || cursor.parentNode) as HTMLElement;
  }

  if (cursor === null) {
    return false;
  }
  return cursor.closest(selector) !== null;
}
-33
View File
@@ -4079,13 +4079,6 @@ __metadata:
languageName: node
linkType: hard
"@joplin/turndown-plugin-gfm@npm:^1.0.49":
version: 1.0.64
resolution: "@joplin/turndown-plugin-gfm@npm:1.0.64"
checksum: 10c0/cbbcba0f111e420535fc1693c1ff859ca7fae5a869a7891ec6cd9ead2f94cec8e858938dcb6ba379c160d086a7fbfadadd53cc9e79d755100a4d0b1cf77947fc
languageName: node
linkType: hard
"@jridgewell/gen-mapping@npm:^0.3.12, @jridgewell/gen-mapping@npm:^0.3.5":
version: 0.3.13
resolution: "@jridgewell/gen-mapping@npm:0.3.13"
@@ -4368,13 +4361,6 @@ __metadata:
languageName: node
linkType: hard
"@mixmark-io/domino@npm:^2.2.0":
version: 2.2.0
resolution: "@mixmark-io/domino@npm:2.2.0"
checksum: 10c0/aa468a15f9217d425220fe6a4b3f9416cbe8e566ee14efc191c6d5cc04fe39338b16a90bbac190f28d44e69465db5f2cf95f479c621ce38060ca6b2a3d346e9d
languageName: node
linkType: hard
"@msgpackr-extract/msgpackr-extract-darwin-arm64@npm:3.0.3":
version: 3.0.3
resolution: "@msgpackr-extract/msgpackr-extract-darwin-arm64@npm:3.0.3"
@@ -8774,13 +8760,6 @@ __metadata:
languageName: node
linkType: hard
"@types/turndown@npm:^5.0.6":
version: 5.0.6
resolution: "@types/turndown@npm:5.0.6"
checksum: 10c0/cc5648c115b67ba413782fd0a8ae273ad6b87940df770ab9a5fefe0303c368704013fca2a55dd08f46a2132a747912fd47f96a83162c47fd189babf1352ac4be
languageName: node
linkType: hard
"@types/unist@npm:^2":
version: 2.0.11
resolution: "@types/unist@npm:2.0.11"
@@ -17313,7 +17292,6 @@ __metadata:
"@hocuspocus/extension-throttle": "npm:1.1.2"
"@hocuspocus/provider": "npm:1.1.2"
"@hocuspocus/server": "npm:1.1.2"
"@joplin/turndown-plugin-gfm": "npm:^1.0.49"
"@juggle/resize-observer": "npm:^3.4.0"
"@linear/sdk": "npm:^58.1.0"
"@node-oauth/oauth2-server": "npm:^5.2.0"
@@ -17404,7 +17382,6 @@ __metadata:
"@types/styled-components": "npm:^5.1.32"
"@types/throng": "npm:^5.0.7"
"@types/tmp": "npm:^0.2.6"
"@types/turndown": "npm:^5.0.6"
"@types/utf8": "npm:^3.0.3"
"@types/validator": "npm:^13.15.3"
"@types/yauzl": "npm:^2.10.3"
@@ -17580,7 +17557,6 @@ __metadata:
tiny-cookie: "npm:^2.5.1"
tmp: "npm:^0.2.5"
tunnel-agent: "npm:^0.6.0"
turndown: "npm:^7.2.2"
typescript: "npm:^5.9.2"
ukkonen: "npm:^2.2.0"
umzug: "npm:^3.8.2"
@@ -21354,15 +21330,6 @@ __metadata:
languageName: node
linkType: hard
"turndown@npm:^7.2.2":
version: 7.2.2
resolution: "turndown@npm:7.2.2"
dependencies:
"@mixmark-io/domino": "npm:^2.2.0"
checksum: 10c0/ee09f7bd67c468505aad6c3a26b11269ca49ffce07eaa9c212926d068f242b11b4e955b31a58289f26674ff29f91209b29454907551dcaec7da712e524cc78c2
languageName: node
linkType: hard
"type-detect@npm:4.0.8":
version: 4.0.8
resolution: "type-detect@npm:4.0.8"