diff --git a/package-lock.json b/package-lock.json index ef24dee..0a581c8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,10 +11,10 @@ "dependencies": { "axios": "^1.7.7", "body-parser": "^1.20.3", - "csvtojson": "^2.0.10", "dmn-moddle": "^10.0.0", "dotenv": "^16.4.5", "express": "^4.21.0", + "fast-csv": "^5.0.1", "node-stream-zip": "^1.15.0", "tar-stream": "^3.1.7", "unzipper": "^0.12.3" @@ -42,6 +42,33 @@ "node": ">=12" } }, + "node_modules/@fast-csv/format": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@fast-csv/format/-/format-5.0.0.tgz", + "integrity": "sha512-IyMpHwYIOGa2f0BJi6Wk55UF0oBA5urdIydoEDYxPo88LFbeb3Yr4rgpu98OAO1glUWheSnNtUgS80LE+/dqmw==", + "license": "MIT", + "dependencies": { + "lodash.escaperegexp": "^4.1.2", + "lodash.isboolean": "^3.0.3", + "lodash.isequal": "^4.5.0", + "lodash.isfunction": "^3.0.9", + "lodash.isnil": "^4.0.0" + } + }, + "node_modules/@fast-csv/parse": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@fast-csv/parse/-/parse-5.0.0.tgz", + "integrity": "sha512-ecF8tCm3jVxeRjEB6VPzmA+1wGaJ5JgaUX2uesOXdXD6qQp0B3EdshOIed4yT1Xlj/F2f8v4zHSo0Oi31L697g==", + "license": "MIT", + "dependencies": { + "lodash.escaperegexp": "^4.1.2", + "lodash.groupby": "^4.6.0", + "lodash.isfunction": "^3.0.9", + "lodash.isnil": "^4.0.0", + "lodash.isundefined": "^3.0.1", + "lodash.uniq": "^4.5.0" + } + }, "node_modules/@jridgewell/resolve-uri": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", @@ -427,23 +454,6 @@ "dev": true, "license": "MIT" }, - "node_modules/csvtojson": { - "version": "2.0.10", - "resolved": "https://registry.npmjs.org/csvtojson/-/csvtojson-2.0.10.tgz", - "integrity": "sha512-lUWFxGKyhraKCW8Qghz6Z0f2l/PqB1W3AO0HKJzGIQ5JRSlR651ekJDiGJbBT4sRNNv5ddnSGVEnsxP9XRCVpQ==", - "license": "MIT", - "dependencies": { - "bluebird": "^3.5.1", - "lodash": "^4.17.3", - "strip-bom": "^2.0.0" - }, - "bin": { - "csvtojson": "bin/csvtojson" - }, - "engines": { - "node": ">=4.0.0" - } - }, "node_modules/debug": { "version": "2.6.9", "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", @@ -633,6 +643,19 @@ "node": ">= 0.10.0" } }, + "node_modules/fast-csv": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/fast-csv/-/fast-csv-5.0.1.tgz", + "integrity": "sha512-Q43zC4NdQD5MAWOVQOF8KA+D6ddvTJjX2ib8zqysm74jZhtk6+dc8C75/OqRV6Y9CLc4kgvbC3PLG8YL4YZfgw==", + "license": "MIT", + "dependencies": { + "@fast-csv/format": "5.0.0", + "@fast-csv/parse": "5.0.0" + }, + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/fast-fifo": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz", @@ -860,12 +883,6 @@ "node": ">= 0.10" } }, - "node_modules/is-utf8": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/is-utf8/-/is-utf8-0.2.1.tgz", - "integrity": "sha512-rMYPYvCzsXywIsldgLaSoPlw5PfoB/ssr7hY4pLfcodrA5M/eArza1a9VmTiNIBNMjOGr1Ow9mTyU2o69U6U9Q==", - "license": "MIT" - }, "node_modules/isarray": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", @@ -884,10 +901,52 @@ "graceful-fs": "^4.1.6" } }, - "node_modules/lodash": { - "version": "4.17.21", - "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", - "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", + "node_modules/lodash.escaperegexp": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/lodash.escaperegexp/-/lodash.escaperegexp-4.1.2.tgz", + "integrity": "sha512-TM9YBvyC84ZxE3rgfefxUWiQKLilstD6k7PTGt6wfbtXF8ixIJLOL3VYyV/z+ZiPLsVxAsKAFVwWlWeb2Y8Yyw==", + "license": "MIT" + }, + "node_modules/lodash.groupby": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/lodash.groupby/-/lodash.groupby-4.6.0.tgz", + "integrity": "sha512-5dcWxm23+VAoz+awKmBaiBvzox8+RqMgFhi7UvX9DHZr2HdxHXM/Wrf8cfKpsW37RNrvtPn6hSwNqurSILbmJw==", + "license": "MIT" + }, + "node_modules/lodash.isboolean": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz", + "integrity": "sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==", + "license": "MIT" + }, + "node_modules/lodash.isequal": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.isequal/-/lodash.isequal-4.5.0.tgz", + "integrity": "sha512-pDo3lu8Jhfjqls6GkMgpahsF9kCyayhgykjyLMNFTKWrpVdAQtYyB4muAMWozBB4ig/dtWAmsMxLEI8wuz+DYQ==", + "license": "MIT" + }, + "node_modules/lodash.isfunction": { + "version": "3.0.9", + "resolved": "https://registry.npmjs.org/lodash.isfunction/-/lodash.isfunction-3.0.9.tgz", + "integrity": "sha512-AirXNj15uRIMMPihnkInB4i3NHeb4iBtNg9WRWuK2o31S+ePwwNmDPaTL3o7dTJ+VXNZim7rFs4rxN4YU1oUJw==", + "license": "MIT" + }, + "node_modules/lodash.isnil": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/lodash.isnil/-/lodash.isnil-4.0.0.tgz", + "integrity": "sha512-up2Mzq3545mwVnMhTDMdfoG1OurpA/s5t88JmQX809eH3C8491iu2sfKhTfhQtKY78oPNhiaHJUpT/dUDAAtng==", + "license": "MIT" + }, + "node_modules/lodash.isundefined": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/lodash.isundefined/-/lodash.isundefined-3.0.1.tgz", + "integrity": "sha512-MXB1is3s899/cD8jheYYE2V9qTHwKvt+npCwpD+1Sxm3Q3cECXCiYHjeHWXNwr6Q0SOBPrYUDxendrO6goVTEA==", + "license": "MIT" + }, + "node_modules/lodash.uniq": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.uniq/-/lodash.uniq-4.5.0.tgz", + "integrity": "sha512-xfBaXQd9ryd9dlSDvnvI0lvxfLJlYAZzXomUYzLKtUeOQvOP5piqAWuGtrhWeqaXK9hhoM/iyJc5AV+XfsX3HQ==", "license": "MIT" }, "node_modules/make-error": { @@ -1328,18 +1387,6 @@ "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", "license": "MIT" }, - "node_modules/strip-bom": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-2.0.0.tgz", - "integrity": "sha512-kwrX1y7czp1E69n2ajbG65mIo9dqvJ+8aBQXOGVxqwvNbsXdFM6Lq37dLAY3mknUwru8CfcCbfOLL/gMo+fi3g==", - "license": "MIT", - "dependencies": { - "is-utf8": "^0.2.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/tar-stream": { "version": "3.1.7", "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz", diff --git a/package.json b/package.json index 05f0751..790d462 100644 --- a/package.json +++ b/package.json @@ -17,10 +17,10 @@ "dependencies": { "axios": "^1.7.7", "body-parser": "^1.20.3", - "csvtojson": "^2.0.10", "dmn-moddle": "^10.0.0", "dotenv": "^16.4.5", "express": "^4.21.0", + "fast-csv": "^5.0.1", "node-stream-zip": "^1.15.0", "tar-stream": "^3.1.7", "unzipper": "^0.12.3" diff --git a/src/services/FileService.ts b/src/services/FileService.ts index 947d4f2..aa75e03 100644 --- a/src/services/FileService.ts +++ b/src/services/FileService.ts @@ -10,11 +10,9 @@ class FileService { * @return Promise - The compressed file stream */ public static async getFileStream(url: string): Promise { - return axios({ - method: "GET", - url: url, - responseType: "stream", - }).then((response) => response.data); + return axios({ method: "GET", url, responseType: "stream" }).then( + (response) => response.data + ); } /** diff --git a/src/services/dataset/Dataset.ts b/src/services/dataset/Dataset.ts index 8b7ab45..f691a02 100644 --- a/src/services/dataset/Dataset.ts +++ b/src/services/dataset/Dataset.ts @@ -11,13 +11,18 @@ import { ParserFactory } from "../parser"; import { DatasetType } from "./"; import { Data, DataConstructor } from "../data"; -type DatasetOptions = { +type DatasetParams = { id: string; dataType: DataConstructor; source: string; file: string; archiveType: ArchiveType; datasetType: DatasetType; + options?: DatasetOptions; +}; + +type DatasetOptions = { + parser?: any; }; /** @@ -31,6 +36,7 @@ class Dataset { readonly datasetType: DatasetType; readonly cachePath: string; private dataType: DataConstructor; + private options?: DatasetOptions; /** * Create a new dataset instance @@ -40,6 +46,7 @@ class Dataset { * @param dataType - The constructor of the data class * @param archiveType - The type of the archive * @param datasetType - The type of the dataset + * @param options - Additional options for the dataset */ constructor({ id, @@ -48,13 +55,15 @@ class Dataset { dataType, archiveType, datasetType, - }: DatasetOptions) { + options, + }: DatasetParams) { this.id = id; this.dataType = dataType; this.source = source; this.file = file; this.archiveType = archiveType; this.datasetType = datasetType; + this.options = options; this.cachePath = CacheService.getCachePath(this.source, ".json"); } @@ -79,18 +88,27 @@ class Dataset { await pipelineAsync( await FileService.getFileStream(this.source), archive.extract(this.file), - parser.parse(), + parser.parse(this.options?.parser), Dataset.transformToData(this.dataType), FileService.createWriteStream(this.cachePath) - ); + ) + .then(() => { + console.log(`Loaded: ${this.source}`); + }) + .catch((err) => { + console.error(`Failed to load dataset: ${this.source}`); + FileService.deleteFile(this.cachePath); + throw err; + }); } private static transformToData(dataType: DataConstructor): Transform { return new Transform({ objectMode: true, transform(chunk: object, _, callback) { - const data: Data = new dataType(JSON.parse(chunk.toString())); + const data: Data = new dataType(chunk); this.push(JSON.stringify(data) + "\n"); + callback(null, JSON.stringify(data) + "\n"); }, }); diff --git a/src/services/dataset/DatasetCollection.ts b/src/services/dataset/DatasetCollection.ts index df94cdc..c1185e7 100644 --- a/src/services/dataset/DatasetCollection.ts +++ b/src/services/dataset/DatasetCollection.ts @@ -12,6 +12,11 @@ class DatasetCollection { dataType: NudgerData, archiveType: ArchiveType.ZIP, datasetType: DatasetType.CSV, + options: { + parser: { + delimiter: ",", + }, + }, }), new Dataset({ id: "openfoodfacts", @@ -21,6 +26,12 @@ class DatasetCollection { dataType: OpenFoodFactsData, archiveType: ArchiveType.GZIP, datasetType: DatasetType.CSV, + options: { + parser: { + delimiter: "\t", + quote: null, + }, + }, }), ]; diff --git a/src/services/parser/CsvParser.ts b/src/services/parser/CsvParser.ts index 073e326..52c6a85 100644 --- a/src/services/parser/CsvParser.ts +++ b/src/services/parser/CsvParser.ts @@ -1,14 +1,21 @@ import { Parser } from "./"; import { Duplex } from "node:stream"; -import csv from "csvtojson"; +// import csv from "csvtojson"; +import * as csv from "fast-csv"; class CsvParser implements Parser { public static instance: CsvParser = new CsvParser(); - public parse(): Duplex { - return csv({ - delimiter: "auto", + public parse(options: any): Duplex { + return csv.parse({ + headers: true, + objectMode: true, + trim: true, + ...options, }); + // return csv({ + // delimiter: "auto", + // }); } } diff --git a/src/services/parser/Parser.ts b/src/services/parser/Parser.ts index 2f13169..7c16ebb 100644 --- a/src/services/parser/Parser.ts +++ b/src/services/parser/Parser.ts @@ -4,7 +4,7 @@ interface Parser { /** * Parse the content of the stream into JSON objects */ - parse(): Duplex; + parse(options: any): Duplex; } export default Parser;