From 18927b525542861f6504a7f0d20a7fc45032c204 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luc=C3=A0s?= <86352901+LucasVbr@users.noreply.github.com>
Date: Wed, 9 Oct 2024 17:06:54 +0200
Subject: [PATCH] feat: Add World-Cities Dataset

Register the world-cities CSV as a third dataset. Its source URL points at
a plain CSV file, so this adds ArchiveType.NONE and a NoneArchive whose
stream forwards chunks unchanged. DatasetType moves to the parser package
as ParserType, and the commented-out csvtojson code is dropped from
CsvParser.

Review changes folded into this revision:
- WorldCitiesData: the row type was named RawSmolaData; renamed to
  RawWorldCitiesData.
- NoneArchive: use node:stream PassThrough instead of a hand-written
  identity Transform.
- ParserFactory: import ParserType together with Parser and CsvParser from
  "./" instead of through a second import from ".".
---
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; confirm the hunk context against
the target tree before applying.
NOTE(review): the last DatasetCollection.ts hunk carries one context line
less than before, because the loadAll() signature line had lost its generic
argument in the collapsed copy.
NOTE(review): the blob ids on the index lines of NoneArchive.ts,
WorldCitiesData.ts and ParserFactory.ts predate the review changes.
NOTE(review): js-yaml and @types/js-yaml are added as dependencies but no
file in this patch imports them; confirm they are needed.

 package-lock.json                         | 27 +++++++++++++++++++++++
 package.json                              |  2 ++
 src/services/archive/ArchiveFactory.ts    |  3 ++-
 src/services/archive/ArchiveType.ts       |  1 +
 src/services/archive/NoneArchive.ts       | 20 +++++++++++++++++
 src/services/archive/index.ts             |  1 +
 src/services/data/WorldCitiesData.ts      | 22 ++++++++++++++++++
 src/services/data/index.ts                |  1 +
 src/services/dataset/Dataset.ts           | 13 +++++------
 src/services/dataset/DatasetCollection.ts | 18 +++++++++++----
 src/services/dataset/DatasetType.ts       |  5 -----
 src/services/dataset/index.ts             |  2 --
 src/services/parser/CsvParser.ts          |  4 ----
 src/services/parser/ParserFactory.ts      |  7 +++---
 src/services/parser/ParserType.ts         |  5 +++++
 src/services/parser/index.ts              |  2 ++
 16 files changed, 106 insertions(+), 27 deletions(-)
 create mode 100644 src/services/archive/NoneArchive.ts
 create mode 100644 src/services/data/WorldCitiesData.ts
 delete mode 100644 src/services/dataset/DatasetType.ts
 create mode 100644 src/services/parser/ParserType.ts

diff --git a/package-lock.json b/package-lock.json
index 0a581c8..aefe585 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -15,12 +15,14 @@
         "dotenv": "^16.4.5",
         "express": "^4.21.0",
         "fast-csv": "^5.0.1",
+        "js-yaml": "^4.1.0",
         "node-stream-zip": "^1.15.0",
         "tar-stream": "^3.1.7",
         "unzipper": "^0.12.3"
       },
       "devDependencies": {
         "@types/express": "^4.17.21",
+        "@types/js-yaml": "^4.0.9",
         "@types/node": "^22.5.5",
         "@types/tar-stream": "^3.1.3",
         "@types/unzipper": "^0.10.10",
@@ -179,6 +181,13 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/@types/js-yaml": {
+      "version": "4.0.9",
+      "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
+      "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/@types/mime": {
       "version": "1.3.5",
       "resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.5.tgz",
@@ -299,6 +308,12 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/argparse": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
+      "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
+      "license": "Python-2.0"
+    },
     "node_modules/array-flatten": {
       "version": "1.1.1",
       "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
@@ -889,6 +904,18 @@
       "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==",
       "license": "MIT"
     },
+    "node_modules/js-yaml": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz",
+      "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==",
+      "license": "MIT",
+      "dependencies": {
+        "argparse": "^2.0.1"
+      },
+      "bin": {
+        "js-yaml": "bin/js-yaml.js"
+      }
+    },
     "node_modules/jsonfile": {
       "version": "6.1.0",
       "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz",
diff --git a/package.json b/package.json
index 790d462..8f3479f 100644
--- a/package.json
+++ b/package.json
@@ -21,12 +21,14 @@
     "dotenv": "^16.4.5",
     "express": "^4.21.0",
     "fast-csv": "^5.0.1",
+    "js-yaml": "^4.1.0",
     "node-stream-zip": "^1.15.0",
     "tar-stream": "^3.1.7",
     "unzipper": "^0.12.3"
   },
   "devDependencies": {
     "@types/express": "^4.17.21",
+    "@types/js-yaml": "^4.0.9",
     "@types/node": "^22.5.5",
     "@types/tar-stream": "^3.1.3",
     "@types/unzipper": "^0.10.10",
diff --git a/src/services/archive/ArchiveFactory.ts b/src/services/archive/ArchiveFactory.ts
index 1b061d3..215d115 100644
--- a/src/services/archive/ArchiveFactory.ts
+++ b/src/services/archive/ArchiveFactory.ts
@@ -1,9 +1,10 @@
-import { Archive, ZipArchive, ArchiveType, GzipArchive } from "./";
+import { Archive, ZipArchive, ArchiveType, GzipArchive, NoneArchive } from "./";
 
 class ArchiveFactory {
   static getArchive(archiveType: ArchiveType): Archive {
     if (archiveType === ArchiveType.ZIP) return ZipArchive.instance;
     if (archiveType === ArchiveType.GZIP) return GzipArchive.instance;
+    if (archiveType === ArchiveType.NONE) return NoneArchive.instance;
     throw new Error("Unsupported archive type");
   }
 
diff --git a/src/services/archive/ArchiveType.ts b/src/services/archive/ArchiveType.ts
index ee9f365..7bacea6 100644
--- a/src/services/archive/ArchiveType.ts
+++ b/src/services/archive/ArchiveType.ts
@@ -1,6 +1,7 @@
 enum ArchiveType {
   ZIP = ".zip",
   GZIP = ".gzip",
+  NONE = "",
 }
 
 export default ArchiveType;
diff --git a/src/services/archive/NoneArchive.ts b/src/services/archive/NoneArchive.ts
new file mode 100644
index 0000000..0c1465e
--- /dev/null
+++ b/src/services/archive/NoneArchive.ts
@@ -0,0 +1,20 @@
+import { Archive } from "./";
+import { Duplex, PassThrough } from "node:stream";
+
+/**
+ * Archive implementation selected by `ArchiveType.NONE`.
+ */
+class NoneArchive implements Archive {
+  public static instance: Archive = new NoneArchive();
+
+  /**
+   * Creates a stream that forwards every chunk it receives unchanged.
+   *
+   * @param source Not used by this implementation.
+   */
+  public extract(source: string): Duplex {
+    return new PassThrough();
+  }
+}
+
+export default NoneArchive;
diff --git a/src/services/archive/index.ts b/src/services/archive/index.ts
index f6562c9..46c66bf 100644
--- a/src/services/archive/index.ts
+++ b/src/services/archive/index.ts
@@ -5,3 +5,4 @@ export { default as ArchiveFactory } from "./ArchiveFactory";
 
 export { default as ZipArchive } from "./ZipArchive";
 export { default as GzipArchive } from "./GzipArchive";
+export { default as NoneArchive } from "./NoneArchive";
diff --git a/src/services/data/WorldCitiesData.ts b/src/services/data/WorldCitiesData.ts
new file mode 100644
index 0000000..630a3d5
--- /dev/null
+++ b/src/services/data/WorldCitiesData.ts
@@ -0,0 +1,22 @@
+import { Data } from "./";
+
+/** Fields of one world-cities record; only two are read below. */
+type RawWorldCitiesData = {
+  name: string;
+  country: string;
+  subcountry: string;
+  geonameid: string;
+};
+
+/** Uses a record's `geonameid` as `input` and its `country` as `output`. */
+class WorldCitiesData implements Data {
+  input: string[];
+  output: string[];
+
+  constructor({ geonameid, country }: RawWorldCitiesData) {
+    this.input = [geonameid];
+    this.output = [country];
+  }
+}
+
+export default WorldCitiesData;
diff --git a/src/services/data/index.ts b/src/services/data/index.ts
index 975abd8..044db4e 100644
--- a/src/services/data/index.ts
+++ b/src/services/data/index.ts
@@ -2,3 +2,4 @@ export { default as Data, DataConstructor } from "./Data";
 
 export { default as NudgerData } from "./NudgerData";
 export { default as OpenFoodFactsData } from "./OpenFoodFactsData";
+export { default as WorldCitiesData } from "./WorldCitiesData";
diff --git a/src/services/dataset/Dataset.ts b/src/services/dataset/Dataset.ts
index f691a02..8a9df20 100644
--- a/src/services/dataset/Dataset.ts
+++ b/src/services/dataset/Dataset.ts
@@ -7,8 +7,7 @@ import CacheService from "../CacheService";
 
 import FileService from "../FileService";
 import { ArchiveFactory, ArchiveType } from "../archive";
-import { ParserFactory } from "../parser";
-import { DatasetType } from "./";
+import { ParserFactory, ParserType } from "../parser";
 import { Data, DataConstructor } from "../data";
 
 type DatasetParams = {
@@ -17,7 +16,7 @@ type DatasetParams = {
   source: string;
   file: string;
   archiveType: ArchiveType;
-  datasetType: DatasetType;
+  parserType: ParserType;
   options?: DatasetOptions;
 };
 
@@ -33,7 +32,7 @@ class Dataset {
   readonly source: string;
   readonly file: string;
   readonly archiveType: ArchiveType;
-  readonly datasetType: DatasetType;
+  readonly parserType: ParserType;
   readonly cachePath: string;
   private dataType: DataConstructor;
   private options?: DatasetOptions;
@@ -54,7 +53,7 @@ class Dataset {
     file,
     dataType,
     archiveType,
-    datasetType,
+    parserType,
     options,
   }: DatasetParams) {
     this.id = id;
@@ -62,7 +61,7 @@ class Dataset {
     this.source = source;
     this.file = file;
     this.archiveType = archiveType;
-    this.datasetType = datasetType;
+    this.parserType = parserType;
     this.options = options;
 
     this.cachePath = CacheService.getCachePath(this.source, ".json");
@@ -80,7 +79,7 @@ class Dataset {
     }
 
     const archive = ArchiveFactory.getArchive(this.archiveType);
-    const parser = ParserFactory.getParser(this.datasetType);
+    const parser = ParserFactory.getParser(this.parserType);
 
     const pipelineAsync = promisify(pipeline);
 
diff --git a/src/services/dataset/DatasetCollection.ts b/src/services/dataset/DatasetCollection.ts
index c1185e7..0f8a4b0 100644
--- a/src/services/dataset/DatasetCollection.ts
+++ b/src/services/dataset/DatasetCollection.ts
@@ -1,6 +1,7 @@
-import { NudgerData, OpenFoodFactsData } from "../data";
+import { NudgerData, OpenFoodFactsData, WorldCitiesData } from "../data";
 import { ArchiveType } from "../archive";
-import { Dataset, DatasetType } from "./";
+import { Dataset } from "./";
+import { ParserType } from "../parser";
 
 class DatasetCollection {
   public static datasets: Dataset[] = [
@@ -11,7 +12,7 @@ class DatasetCollection {
       file: "open4goods-full-gtin-dataset.csv",
       dataType: NudgerData,
       archiveType: ArchiveType.ZIP,
-      datasetType: DatasetType.CSV,
+      parserType: ParserType.CSV,
       options: {
         parser: {
           delimiter: ",",
@@ -25,7 +26,7 @@ class DatasetCollection {
       file: "en.openfoodfacts.org.products.csv",
       dataType: OpenFoodFactsData,
       archiveType: ArchiveType.GZIP,
-      datasetType: DatasetType.CSV,
+      parserType: ParserType.CSV,
       options: {
         parser: {
           delimiter: "\t",
@@ -33,5 +34,14 @@ class DatasetCollection {
         },
       },
     }),
+    new Dataset({
+      id: "world-cities",
+      source:
+        "https://raw.githubusercontent.com/datasets/world-cities/refs/heads/main/data/world-cities.csv",
+      file: "world-cities.csv",
+      dataType: WorldCitiesData,
+      archiveType: ArchiveType.NONE,
+      parserType: ParserType.CSV,
+    }),
   ];
 
diff --git a/src/services/dataset/DatasetType.ts b/src/services/dataset/DatasetType.ts
deleted file mode 100644
index 19a7ecc..0000000
--- a/src/services/dataset/DatasetType.ts
+++ /dev/null
@@ -1,5 +0,0 @@
-enum DatasetType {
-  CSV = ".csv",
-}
-
-export default DatasetType;
diff --git a/src/services/dataset/index.ts b/src/services/dataset/index.ts
index f300450..85af266 100644
--- a/src/services/dataset/index.ts
+++ b/src/services/dataset/index.ts
@@ -1,4 +1,2 @@
-export { default as DatasetType } from "./DatasetType";
-
 export { default as Dataset } from "./Dataset";
 export { default as DatasetCollection } from "./DatasetCollection";
diff --git a/src/services/parser/CsvParser.ts b/src/services/parser/CsvParser.ts
index 52c6a85..46d58a3 100644
--- a/src/services/parser/CsvParser.ts
+++ b/src/services/parser/CsvParser.ts
@@ -1,6 +1,5 @@
 import { Parser } from "./";
 import { Duplex } from "node:stream";
-// import csv from "csvtojson";
 import * as csv from "fast-csv";
 
 class CsvParser implements Parser {
@@ -13,9 +12,6 @@ class CsvParser implements Parser {
       trim: true,
       ...options,
     });
-    // return csv({
-    //   delimiter: "auto",
-    // });
   }
 }
 
diff --git a/src/services/parser/ParserFactory.ts b/src/services/parser/ParserFactory.ts
index aff7853..c57a139 100644
--- a/src/services/parser/ParserFactory.ts
+++ b/src/services/parser/ParserFactory.ts
@@ -1,5 +1,4 @@
-import { Parser, CsvParser } from "./";
-import { DatasetType } from "../dataset";
+import { Parser, CsvParser, ParserType } from "./";
 
 class ParserFactory {
   /**
@@ -7,8 +6,8 @@ class ParserFactory {
    * @param fileType The type of the dataset
    * @returns The parser corresponding to the dataset type
    */
-  static getParser(fileType: DatasetType): Parser {
-    if (fileType === DatasetType.CSV) return CsvParser.instance;
+  static getParser(fileType: ParserType): Parser {
+    if (fileType === ParserType.CSV) return CsvParser.instance;
     throw new Error("Unsupported file type");
   }
 }
diff --git a/src/services/parser/ParserType.ts b/src/services/parser/ParserType.ts
new file mode 100644
index 0000000..447623c
--- /dev/null
+++ b/src/services/parser/ParserType.ts
@@ -0,0 +1,5 @@
+enum ParserType {
+  CSV = ".csv",
+}
+
+export default ParserType;
diff --git a/src/services/parser/index.ts b/src/services/parser/index.ts
index 12ea722..22c3673 100644
--- a/src/services/parser/index.ts
+++ b/src/services/parser/index.ts
@@ -1,3 +1,5 @@
+export { default as ParserType } from "./ParserType";
+
 export { default as ParserFactory } from "./ParserFactory";
 
 export { default as Parser } from "./Parser";