diff --git a/package.json b/package.json index d78d224741..1ce3549599 100644 --- a/package.json +++ b/package.json @@ -296,6 +296,7 @@ "@types/invariant": "^2.2.37", "@types/ioredis-mock": "^8.2.6", "@types/jest": "^29.5.14", + "@types/js-yaml": "^4.0.9", "@types/jsonwebtoken": "^8.5.9", "@types/katex": "^0.16.7", "@types/koa": "^2.15.0", diff --git a/server/commands/documentImporter.test.ts b/server/commands/documentImporter.test.ts index 92d675acfd..1356f99cec 100644 --- a/server/commands/documentImporter.test.ts +++ b/server/commands/documentImporter.test.ts @@ -213,6 +213,32 @@ describe("documentImporter", () => { expect(response.title).toEqual("Title"); }); + it("should convert frontmatter to yaml codeblock", async () => { + const user = await buildUser(); + const fileName = "markdown-frontmatter.md"; + const content = await fs.readFile( + path.resolve(__dirname, "..", "test", "fixtures", fileName), + "utf8" + ); + const response = await sequelize.transaction((transaction) => + documentImporter({ + user, + mimeType: "text/plain", + fileName, + content, + ctx: createContext({ user, transaction }), + }) + ); + + expect(response.text).toContain("```yaml"); + expect(response.text).toContain("title: Test Document"); + expect(response.text).toContain("date: 2024-01-15"); + expect(response.text).toContain("tags: [test, markdown]"); + expect(response.text).toContain("```"); + expect(response.text).toContain("This is content after frontmatter"); + expect(response.title).toEqual("Heading 1"); + }); + it("should fallback to extension if mimetype unknown", async () => { const user = await buildUser(); const fileName = "markdown.md"; diff --git a/server/test/fixtures/markdown-frontmatter.md b/server/test/fixtures/markdown-frontmatter.md new file mode 100644 index 0000000000..69a7c35b82 --- /dev/null +++ b/server/test/fixtures/markdown-frontmatter.md @@ -0,0 +1,14 @@ +--- +title: Test Document +date: 2024-01-15 +tags: [test, markdown] +author: John Doe +--- + +# Heading 1 + 
+This is content after frontmatter. + +## Heading 2 + +More content here. diff --git a/server/utils/DocumentConverter.test.ts b/server/utils/DocumentConverter.test.ts index 6a545170e8..3857992a07 100644 --- a/server/utils/DocumentConverter.test.ts +++ b/server/utils/DocumentConverter.test.ts @@ -148,6 +148,114 @@ Jane,24,`; expect(result.title).toEqual(""); expect(result.text).toContain("Subtitle"); }); + + it("should convert frontmatter to yaml codeblock", async () => { + const md = `--- +title: Test Document +date: 2024-01-15 +tags: [test, markdown] +--- + +# My Title + +Content after frontmatter`; + const result = await DocumentConverter.convert( + md, + "test.md", + "text/markdown" + ); + + // Frontmatter should be converted to a YAML codeblock + expect(result.text).toContain("```yaml"); + expect(result.text).toContain("title: Test Document"); + expect(result.text).toContain("date: 2024-01-15"); + expect(result.text).toContain("tags: [test, markdown]"); + expect(result.text).toContain("```"); + // Content should still be present + expect(result.text).toContain("Content after frontmatter"); + // H1 should be extracted as title + expect(result.title).toEqual("My Title"); + }); + + it("should handle markdown without frontmatter", async () => { + const md = "# Title\n\nRegular content"; + const result = await DocumentConverter.convert( + md, + "test.md", + "text/markdown" + ); + + expect(result.title).toEqual("Title"); + expect(result.text).toContain("Regular content"); + expect(result.text).not.toContain("```yaml"); + }); + + it("should handle frontmatter with no content after", async () => { + const md = `--- +title: Only Frontmatter +---`; + const result = await DocumentConverter.convert( + md, + "test.md", + "text/markdown" + ); + + expect(result.text).toContain("```yaml"); + expect(result.text).toContain("title: Only Frontmatter"); + expect(result.text).toContain("```"); + expect(result.title).toEqual(""); + }); + + it("should not convert incomplete 
frontmatter", async () => { + const md = `--- +title: Test +Content without closing delimiter`; + const result = await DocumentConverter.convert( + md, + "test.md", + "text/markdown" + ); + + // Should not convert as it's not proper frontmatter + expect(result.text).not.toContain("```yaml"); + expect(result.text).toContain("title: Test"); + }); + + it("should not convert frontmatter if not at start", async () => { + const md = `# Title + +Some content + +--- +title: Test +--- + +More content`; + const result = await DocumentConverter.convert( + md, + "test.md", + "text/markdown" + ); + + // Should not convert as frontmatter must be at the start + expect(result.text).not.toContain("```yaml"); + }); + + it("should handle invalid YAML in frontmatter", async () => { + const md = `--- +invalid: yaml: content: here +--- + +Content`; + const result = await DocumentConverter.convert( + md, + "test.md", + "text/markdown" + ); + + // Should not convert invalid YAML + expect(result.text).not.toContain("```yaml"); + }); }); }); diff --git a/server/utils/DocumentConverter.ts b/server/utils/DocumentConverter.ts index 69c0817dc0..1e9ab3662f 100644 --- a/server/utils/DocumentConverter.ts +++ b/server/utils/DocumentConverter.ts @@ -5,6 +5,7 @@ import { simpleParser } from "mailparser"; import mammoth from "mammoth"; import type { Node } from "prosemirror-model"; import { DOMParser as ProsemirrorDOMParser } from "prosemirror-model"; +import yaml from "js-yaml"; import { ProsemirrorHelper as SharedProsemirrorHelper } from "@shared/utils/ProsemirrorHelper"; import { schema, serializer } from "@server/editor"; import { FileImportError } from "@server/errors"; @@ -201,24 +202,30 @@ export class DocumentConverter { fileName: string, mimeType: string ): Promise { + let markdown: string; + switch (mimeType) { case "text/plain": case "text/markdown": - return this.bufferToString(content); + markdown = this.bufferToString(content); + break; case "text/csv": return 
this.csvToMarkdown(content); - default: - break; + default: { + const extension = fileName.split(".").pop(); + switch (extension) { + case "md": + case "markdown": + markdown = this.bufferToString(content); + break; + default: + throw FileImportError(`File type ${mimeType} not supported`); + } + } } - const extension = fileName.split(".").pop(); - switch (extension) { - case "md": - case "markdown": - return this.bufferToString(content); - default: - throw FileImportError(`File type ${mimeType} not supported`); - } + // Process frontmatter and convert it to a YAML codeblock + return this.processFrontmatter(markdown); } /** @@ -404,4 +411,37 @@ export class DocumentConverter { private static bufferToString(content: Buffer | string): string { return typeof content === "string" ? content : content.toString("utf8"); } + + /** + * Parse and convert frontmatter to a YAML codeblock. + * + * @param content The markdown content that may contain frontmatter. + * @returns The markdown content with frontmatter converted to a YAML codeblock. 
+   */
+  private static processFrontmatter(content: string): string {
+    // Frontmatter must open the document. Accept LF and CRLF line endings as
+    // well as trailing spaces/tabs after the `---` delimiters, so files
+    // authored on Windows are recognised the same way as Unix ones.
+    const frontmatterRegex =
+      /^---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/;
+    const match = frontmatterRegex.exec(content);
+
+    if (!match) {
+      return content;
+    }
+
+    // Normalise line endings so the generated codeblock uses LF throughout.
+    const frontmatterContent = match[1].replace(/\r\n/g, "\n");
+    const remainingContent = content.slice(match[0].length);
+
+    // NOTE(review): yaml.load is given imported (untrusted) file content. The
+    // result is only inspected for its shape and never executed or persisted;
+    // js-yaml v4 is assumed here (its default schema cannot build functions).
+    let parsed: unknown;
+    try {
+      parsed = yaml.load(frontmatterContent);
+    } catch {
+      // Not valid YAML, return content unchanged.
+      return content;
+    }
+
+    // Any plain text is a valid YAML scalar, so "parsed without throwing" is
+    // not enough: a document that starts with a thematic break (`---`), some
+    // prose and a second `---` would otherwise be rewritten as a codeblock.
+    // Only a key/value mapping is treated as frontmatter.
+    const isMapping =
+      typeof parsed === "object" &&
+      parsed !== null &&
+      !Array.isArray(parsed) &&
+      !(parsed instanceof Date);
+
+    if (!isMapping) {
+      return content;
+    }
+
+    // Pick a fence longer than any backtick run inside the YAML so a value
+    // containing ``` cannot terminate the codeblock early.
+    const longestBacktickRun = (frontmatterContent.match(/`+/g) ?? []).reduce(
+      (longest, run) => Math.max(longest, run.length),
+      0
+    );
+    const codeBlockDelimiter = "`".repeat(Math.max(3, longestBacktickRun + 1));
+
+    // Convert frontmatter to a YAML codeblock, followed by the original body.
+    const yamlCodeblock = `${codeBlockDelimiter}yaml\n${frontmatterContent}\n${codeBlockDelimiter}\n\n`;
+
+    return yamlCodeblock + remainingContent;
+  }
 }
diff --git a/yarn.lock b/yarn.lock
index aab39e8dee..39b5594279 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -8119,6 +8119,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/js-yaml@npm:^4.0.9":
+  version: 4.0.9
+  resolution: "@types/js-yaml@npm:4.0.9"
+  checksum: 10c0/24de857aa8d61526bbfbbaa383aa538283ad17363fcd5bb5148e2c7f604547db36646440e739d78241ed008702a8920665d1add5618687b6743858fae00da211
+  languageName: node
+  linkType: hard
+
 "@types/jsdom@npm:^20.0.0":
   version: 20.0.1
   resolution: "@types/jsdom@npm:20.0.1"
@@ -17337,6 +17344,7 @@ __metadata:
     "@types/invariant": "npm:^2.2.37"
     "@types/ioredis-mock": "npm:^8.2.6"
     "@types/jest": "npm:^29.5.14"
+    "@types/js-yaml": "npm:^4.0.9"
     "@types/jsonwebtoken": "npm:^8.5.9"
     "@types/katex": "npm:^0.16.7"
     "@types/koa": "npm:^2.15.0"