feat: Adds support for importing CSV files (#7912)

* feat: Adds support for importing CSV files

* test

* tsc
This commit is contained in:
Tom Moor
2024-11-07 22:09:02 -05:00
committed by GitHub
parent 9b26ccda19
commit 55ffd6d098
5 changed files with 121 additions and 1 deletions
+1
View File
@@ -63,6 +63,7 @@ export default class DocumentsStore extends Store<Document> {
".md",
".doc",
".docx",
"text/csv",
"text/markdown",
"text/plain",
"text/html",
+1
View File
@@ -68,6 +68,7 @@
"@dnd-kit/modifiers": "^6.0.1",
"@dnd-kit/sortable": "^7.0.2",
"@emoji-mart/data": "^1.2.1",
"@fast-csv/parse": "^5.0.2",
"@fortawesome/fontawesome-svg-core": "^6.5.2",
"@fortawesome/free-brands-svg-icons": "^6.5.2",
"@fortawesome/free-solid-svg-icons": "^6.5.2",
+31
View File
@@ -0,0 +1,31 @@
import { DocumentConverter } from "./DocumentConverter";
// Exercises DocumentConverter.csvToMarkdown with the comma and semicolon
// delimiters, including a quoted field that contains escaped quotes and a comma.
describe("csvToMarkdown", () => {
  it("should convert csv to markdown with comma", async () => {
    const input = ["name,age", "John,25", "Jane,24"].join("\n");
    // The trailing empty entry yields the final newline of the table.
    const expected = [
      "| name | age |",
      "| --- | --- |",
      "| John | 25 |",
      "| Jane | 24 |",
      "",
    ].join("\n");

    const actual = await DocumentConverter.csvToMarkdown(input);

    expect(actual).toEqual(expected);
  });

  it("should convert csv to markdown with semicolon", async () => {
    // The quoted field uses doubled quotes as escapes and embeds a comma,
    // which must not be mistaken for the delimiter.
    const input = [
      "name;age",
      "John;25",
      `"Joan ""the bone"", Anne";24`,
    ].join("\n");
    const expected = [
      "| name | age |",
      "| --- | --- |",
      "| John | 25 |",
      `| Joan "the bone", Anne | 24 |`,
      "",
    ].join("\n");

    const actual = await DocumentConverter.csvToMarkdown(input);

    expect(actual).toEqual(expected);
  });
});
+46 -1
View File
@@ -1,3 +1,4 @@
import { parse } from "@fast-csv/parse";
import escapeRegExp from "lodash/escapeRegExp";
import { simpleParser } from "mailparser";
import mammoth from "mammoth";
@@ -30,6 +31,8 @@ export class DocumentConverter {
case "text/plain":
case "text/markdown":
return this.fileToMarkdown(content);
case "text/csv":
return this.csvToMarkdown(content);
default:
break;
}
@@ -71,7 +74,49 @@ export class DocumentConverter {
return turndownService.turndown(content);
}
public static async fileToMarkdown(content: Buffer | string) {
/**
 * Converts CSV content into a Markdown table.
 *
 * The content is passed through `fileToMarkdown` and trimmed, then the
 * delimiter is chosen from `;`, `,` and tab by counting occurrences on the
 * first line (falling back to `,` when none occur). The first parsed row is
 * used as the table header and every following row becomes a table row.
 *
 * @param content The CSV document, as a string or Buffer.
 * @returns A promise resolving to the Markdown table followed by a trailing
 * newline, or to an empty string when the input contains no rows.
 * @throws Rejects with a FileImportError when the CSV parser reports an error.
 */
public static csvToMarkdown(content: Buffer | string): Promise<string> {
  return new Promise((resolve, reject) => {
    const text = this.fileToMarkdown(content).trim();
    const firstLine = text.split("\n")[0];

    // Determine the separator used in the CSV file based on number of occurrences of each separator on first line
    const delimiter = [";", ",", "\t"].reduce(
      (acc, separator) => {
        const count = (
          firstLine.match(new RegExp(escapeRegExp(separator), "g")) || []
        ).length;
        return count > acc.count ? { count, separator } : acc;
      },
      { count: 0, separator: "," }
    ).separator;

    // A literal pipe would be read as a column boundary and a line break would
    // end the table row, so neutralize both before emitting a cell.
    // NOTE(review): line breaks are collapsed to a single space – confirm this
    // is the desired rendering for multi-line CSV fields.
    const escapeCell = (cell: string) =>
      cell.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
    const toRow = (cells: string[]) =>
      `| ${cells.map(escapeCell).join(" | ")} |`;

    const lines: string[][] = [];
    const stream = parse({ delimiter })
      .on("error", (error) => {
        reject(
          FileImportError(`There was an error parsing the CSV file: ${error}`)
        );
      })
      .on("data", (row) => lines.push(row))
      .on("end", () => {
        const [headers, ...rows] = lines;

        // Nothing was parsed (e.g. an empty file). Settle the promise here
        // rather than dereferencing a missing header row inside this handler.
        if (!headers) {
          resolve("");
          return;
        }

        const headerLine = toRow(headers);
        const separatorLine = `| ${headers.map(() => "---").join(" | ")} |`;
        const table = rows.map(toRow).join("\n");

        resolve(`${headerLine}\n${separatorLine}\n${table}\n`);
      });

    stream.write(text);
    stream.end();
  });
}
public static fileToMarkdown(content: Buffer | string) {
if (content instanceof Buffer) {
content = content.toString("utf8");
}
+42
View File
@@ -2255,6 +2255,18 @@
resolved "https://registry.yarnpkg.com/@faker-js/faker/-/faker-8.4.1.tgz#5d5e8aee8fce48f5e189bf730ebd1f758f491451"
integrity sha512-XQ3cU+Q8Uqmrbf2e0cIC/QN43sTBSC8KF12u29Mb47tWrt2hAgBXSgpZMj4Ao8Uk0iJcU99QsOCaIL8934obCg==
"@fast-csv/parse@^5.0.2":
version "5.0.2"
resolved "https://registry.yarnpkg.com/@fast-csv/parse/-/parse-5.0.2.tgz#204000dfd661b580a10a8cd035a0e986fc6954a9"
integrity sha512-gMu1Btmm99TP+wc0tZnlH30E/F1Gw1Tah3oMDBHNPe9W8S68ixVHjt89Wg5lh7d9RuQMtwN+sGl5kxR891+fzw==
dependencies:
lodash.escaperegexp "^4.1.2"
lodash.groupby "^4.6.0"
lodash.isfunction "^3.0.9"
lodash.isnil "^4.0.0"
lodash.isundefined "^3.0.1"
lodash.uniq "^4.5.0"
"@formatjs/ecma402-abstract@1.12.0":
version "1.12.0"
resolved "https://registry.yarnpkg.com/@formatjs/ecma402-abstract/-/ecma402-abstract-1.12.0.tgz#2fb5e8983d5fae2fad9ec6c77aec1803c2b88d8e"
@@ -10955,6 +10967,16 @@ lodash.defaults@^4.2.0:
resolved "https://registry.yarnpkg.com/lodash.defaults/-/lodash.defaults-4.2.0.tgz#d09178716ffea4dde9e5fb7b37f6f0802274580c"
integrity "sha1-0JF4cW/+pN3p5ft7N/bwgCJ0WAw= sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ=="
lodash.escaperegexp@^4.1.2:
version "4.1.2"
resolved "https://registry.yarnpkg.com/lodash.escaperegexp/-/lodash.escaperegexp-4.1.2.tgz#64762c48618082518ac3df4ccf5d5886dae20347"
integrity sha512-TM9YBvyC84ZxE3rgfefxUWiQKLilstD6k7PTGt6wfbtXF8ixIJLOL3VYyV/z+ZiPLsVxAsKAFVwWlWeb2Y8Yyw==
lodash.groupby@^4.6.0:
version "4.6.0"
resolved "https://registry.yarnpkg.com/lodash.groupby/-/lodash.groupby-4.6.0.tgz#0b08a1dcf68397c397855c3239783832df7403d1"
integrity sha512-5dcWxm23+VAoz+awKmBaiBvzox8+RqMgFhi7UvX9DHZr2HdxHXM/Wrf8cfKpsW37RNrvtPn6hSwNqurSILbmJw==
lodash.includes@^4.3.0:
version "4.3.0"
resolved "https://registry.yarnpkg.com/lodash.includes/-/lodash.includes-4.3.0.tgz#60bb98a87cb923c68ca1e51325483314849f553f"
@@ -10970,11 +10992,21 @@ lodash.isboolean@^3.0.3:
resolved "https://registry.yarnpkg.com/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz#6c2e171db2a257cd96802fd43b01b20d5f5870f6"
integrity sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==
lodash.isfunction@^3.0.9:
version "3.0.9"
resolved "https://registry.yarnpkg.com/lodash.isfunction/-/lodash.isfunction-3.0.9.tgz#06de25df4db327ac931981d1bdb067e5af68d051"
integrity sha512-AirXNj15uRIMMPihnkInB4i3NHeb4iBtNg9WRWuK2o31S+ePwwNmDPaTL3o7dTJ+VXNZim7rFs4rxN4YU1oUJw==
lodash.isinteger@^4.0.4:
version "4.0.4"
resolved "https://registry.yarnpkg.com/lodash.isinteger/-/lodash.isinteger-4.0.4.tgz#619c0af3d03f8b04c31f5882840b77b11cd68343"
integrity sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==
lodash.isnil@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/lodash.isnil/-/lodash.isnil-4.0.0.tgz#49e28cd559013458c814c5479d3c663a21bfaa6c"
integrity sha512-up2Mzq3545mwVnMhTDMdfoG1OurpA/s5t88JmQX809eH3C8491iu2sfKhTfhQtKY78oPNhiaHJUpT/dUDAAtng==
lodash.isnumber@^3.0.3:
version "3.0.3"
resolved "https://registry.yarnpkg.com/lodash.isnumber/-/lodash.isnumber-3.0.3.tgz#3ce76810c5928d03352301ac287317f11c0b1ffc"
@@ -10990,6 +11022,11 @@ lodash.isstring@^4.0.1:
resolved "https://registry.yarnpkg.com/lodash.isstring/-/lodash.isstring-4.0.1.tgz#d527dfb5456eca7cc9bb95d5daeaf88ba54a5451"
integrity sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==
lodash.isundefined@^3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/lodash.isundefined/-/lodash.isundefined-3.0.1.tgz#23ef3d9535565203a66cefd5b830f848911afb48"
integrity sha512-MXB1is3s899/cD8jheYYE2V9qTHwKvt+npCwpD+1Sxm3Q3cECXCiYHjeHWXNwr6Q0SOBPrYUDxendrO6goVTEA==
lodash.merge@^4.6.2:
version "4.6.2"
resolved "https://registry.yarnpkg.com/lodash.merge/-/lodash.merge-4.6.2.tgz#558aa53b43b661e1925a0afdfa36a9a1085fe57a"
@@ -11010,6 +11047,11 @@ lodash.sortby@^4.7.0:
resolved "https://registry.yarnpkg.com/lodash.sortby/-/lodash.sortby-4.7.0.tgz#edd14c824e2cc9c1e0b0a1b42bb5210516a42438"
integrity "sha1-7dFMgk4sycHgsKG0K7UhBRakJDg= sha512-HDWXG8isMntAyRF5vZ7xKuEvOhT4AhlRt/3czTSjvGUxjYCBVRQY48ViDHyfYz9VIoBkW4TMGQNapx+l3RUwdA=="
lodash.uniq@^4.5.0:
version "4.5.0"
resolved "https://registry.yarnpkg.com/lodash.uniq/-/lodash.uniq-4.5.0.tgz#d0225373aeb652adc1bc82e4945339a842754773"
integrity sha512-xfBaXQd9ryd9dlSDvnvI0lvxfLJlYAZzXomUYzLKtUeOQvOP5piqAWuGtrhWeqaXK9hhoM/iyJc5AV+XfsX3HQ==
lodash@4.17.21, lodash@^4.0.1, lodash@^4.17.11, lodash@^4.17.15, lodash@^4.17.20, lodash@^4.17.21, lodash@^4.17.4:
version "4.17.21"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c"