feat: Add World-Cities Dataset

This commit is contained in:
Lucàs
2024-10-09 17:06:54 +02:00
parent 93ee52ddc2
commit 18927b5255
16 changed files with 100 additions and 26 deletions
+27
View File
@@ -15,12 +15,14 @@
"dotenv": "^16.4.5", "dotenv": "^16.4.5",
"express": "^4.21.0", "express": "^4.21.0",
"fast-csv": "^5.0.1", "fast-csv": "^5.0.1",
"js-yaml": "^4.1.0",
"node-stream-zip": "^1.15.0", "node-stream-zip": "^1.15.0",
"tar-stream": "^3.1.7", "tar-stream": "^3.1.7",
"unzipper": "^0.12.3" "unzipper": "^0.12.3"
}, },
"devDependencies": { "devDependencies": {
"@types/express": "^4.17.21", "@types/express": "^4.17.21",
"@types/js-yaml": "^4.0.9",
"@types/node": "^22.5.5", "@types/node": "^22.5.5",
"@types/tar-stream": "^3.1.3", "@types/tar-stream": "^3.1.3",
"@types/unzipper": "^0.10.10", "@types/unzipper": "^0.10.10",
@@ -179,6 +181,13 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/@types/js-yaml": {
"version": "4.0.9",
"resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
"integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/mime": { "node_modules/@types/mime": {
"version": "1.3.5", "version": "1.3.5",
"resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.5.tgz", "resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.5.tgz",
@@ -299,6 +308,12 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/argparse": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
"integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
"license": "Python-2.0"
},
"node_modules/array-flatten": { "node_modules/array-flatten": {
"version": "1.1.1", "version": "1.1.1",
"resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
@@ -889,6 +904,18 @@
"integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/js-yaml": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz",
"integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==",
"license": "MIT",
"dependencies": {
"argparse": "^2.0.1"
},
"bin": {
"js-yaml": "bin/js-yaml.js"
}
},
"node_modules/jsonfile": { "node_modules/jsonfile": {
"version": "6.1.0", "version": "6.1.0",
"resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz", "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz",
+2
View File
@@ -21,12 +21,14 @@
"dotenv": "^16.4.5", "dotenv": "^16.4.5",
"express": "^4.21.0", "express": "^4.21.0",
"fast-csv": "^5.0.1", "fast-csv": "^5.0.1",
"js-yaml": "^4.1.0",
"node-stream-zip": "^1.15.0", "node-stream-zip": "^1.15.0",
"tar-stream": "^3.1.7", "tar-stream": "^3.1.7",
"unzipper": "^0.12.3" "unzipper": "^0.12.3"
}, },
"devDependencies": { "devDependencies": {
"@types/express": "^4.17.21", "@types/express": "^4.17.21",
"@types/js-yaml": "^4.0.9",
"@types/node": "^22.5.5", "@types/node": "^22.5.5",
"@types/tar-stream": "^3.1.3", "@types/tar-stream": "^3.1.3",
"@types/unzipper": "^0.10.10", "@types/unzipper": "^0.10.10",
+2 -1
View File
@@ -1,9 +1,10 @@
import { Archive, ZipArchive, ArchiveType, GzipArchive } from "./"; import { Archive, ZipArchive, ArchiveType, GzipArchive, NoneArchive } from "./";
class ArchiveFactory { class ArchiveFactory {
static getArchive(archiveType: ArchiveType): Archive { static getArchive(archiveType: ArchiveType): Archive {
if (archiveType === ArchiveType.ZIP) return ZipArchive.instance; if (archiveType === ArchiveType.ZIP) return ZipArchive.instance;
if (archiveType === ArchiveType.GZIP) return GzipArchive.instance; if (archiveType === ArchiveType.GZIP) return GzipArchive.instance;
if (archiveType === ArchiveType.NONE) return NoneArchive.instance;
throw new Error("Unsupported archive type"); throw new Error("Unsupported archive type");
} }
+1
View File
@@ -1,6 +1,7 @@
enum ArchiveType { enum ArchiveType {
ZIP = ".zip", ZIP = ".zip",
GZIP = ".gzip", GZIP = ".gzip",
NONE = "",
} }
export default ArchiveType; export default ArchiveType;
+16
View File
@@ -0,0 +1,16 @@
import { Archive } from "./";
import { Duplex, Transform } from "node:stream";
/**
 * Pass-through "archive" for datasets that are not compressed at all
 * (ArchiveType.NONE): extract() returns an identity stream that forwards
 * every chunk unchanged, so uncompressed sources flow through the same
 * archive pipeline as zipped/gzipped ones.
 */
class NoneArchive implements Archive {
  public static instance: Archive = new NoneArchive();

  /**
   * Builds an identity stream for an uncompressed source.
   *
   * @param _source Unused; kept only to satisfy the Archive interface.
   * @returns A Duplex stream that emits its input bytes verbatim.
   */
  public extract(_source: string): Duplex {
    return new Transform({
      transform(chunk, _encoding, callback) {
        callback(null, chunk);
      },
    });
  }
}
export default NoneArchive;
+1
View File
@@ -5,3 +5,4 @@ export { default as ArchiveFactory } from "./ArchiveFactory";
export { default as ZipArchive } from "./ZipArchive"; export { default as ZipArchive } from "./ZipArchive";
export { default as GzipArchive } from "./GzipArchive"; export { default as GzipArchive } from "./GzipArchive";
export { default as NoneArchive } from "./NoneArchive";
+20
View File
@@ -0,0 +1,20 @@
import { Data } from "./";
// Shape of one parsed row of the world-cities CSV
// (https://github.com/datasets/world-cities). The unused name/subcountry
// columns are kept here to document the full CSV schema.
type RawWorldCitiesData = {
  name: string;
  country: string;
  subcountry: string;
  geonameid: string;
};

/**
 * Adapts a world-cities CSV row to the Data contract: the geonameid
 * becomes the input value and the country the output value.
 */
class WorldCitiesData implements Data {
  input: string[];
  output: string[];

  constructor({ geonameid, country }: RawWorldCitiesData) {
    this.input = [geonameid];
    this.output = [country];
  }
}
export default WorldCitiesData;
+1
View File
@@ -2,3 +2,4 @@ export { default as Data, DataConstructor } from "./Data";
export { default as NudgerData } from "./NudgerData"; export { default as NudgerData } from "./NudgerData";
export { default as OpenFoodFactsData } from "./OpenFoodFactsData"; export { default as OpenFoodFactsData } from "./OpenFoodFactsData";
export { default as WorldCitiesData } from "./WorldCitiesData";
+6 -7
View File
@@ -7,8 +7,7 @@ import CacheService from "../CacheService";
import FileService from "../FileService"; import FileService from "../FileService";
import { ArchiveFactory, ArchiveType } from "../archive"; import { ArchiveFactory, ArchiveType } from "../archive";
import { ParserFactory } from "../parser"; import { ParserFactory, ParserType } from "../parser";
import { DatasetType } from "./";
import { Data, DataConstructor } from "../data"; import { Data, DataConstructor } from "../data";
type DatasetParams = { type DatasetParams = {
@@ -17,7 +16,7 @@ type DatasetParams = {
source: string; source: string;
file: string; file: string;
archiveType: ArchiveType; archiveType: ArchiveType;
datasetType: DatasetType; parserType: ParserType;
options?: DatasetOptions; options?: DatasetOptions;
}; };
@@ -33,7 +32,7 @@ class Dataset {
readonly source: string; readonly source: string;
readonly file: string; readonly file: string;
readonly archiveType: ArchiveType; readonly archiveType: ArchiveType;
readonly datasetType: DatasetType; readonly parserType: ParserType;
readonly cachePath: string; readonly cachePath: string;
private dataType: DataConstructor<Data>; private dataType: DataConstructor<Data>;
private options?: DatasetOptions; private options?: DatasetOptions;
@@ -54,7 +53,7 @@ class Dataset {
file, file,
dataType, dataType,
archiveType, archiveType,
datasetType, parserType,
options, options,
}: DatasetParams) { }: DatasetParams) {
this.id = id; this.id = id;
@@ -62,7 +61,7 @@ class Dataset {
this.source = source; this.source = source;
this.file = file; this.file = file;
this.archiveType = archiveType; this.archiveType = archiveType;
this.datasetType = datasetType; this.parserType = parserType;
this.options = options; this.options = options;
this.cachePath = CacheService.getCachePath(this.source, ".json"); this.cachePath = CacheService.getCachePath(this.source, ".json");
@@ -80,7 +79,7 @@ class Dataset {
} }
const archive = ArchiveFactory.getArchive(this.archiveType); const archive = ArchiveFactory.getArchive(this.archiveType);
const parser = ParserFactory.getParser(this.datasetType); const parser = ParserFactory.getParser(this.parserType);
const pipelineAsync = promisify(pipeline); const pipelineAsync = promisify(pipeline);
+14 -4
View File
@@ -1,6 +1,7 @@
import { NudgerData, OpenFoodFactsData } from "../data"; import { NudgerData, OpenFoodFactsData, WorldCitiesData } from "../data";
import { ArchiveType } from "../archive"; import { ArchiveType } from "../archive";
import { Dataset, DatasetType } from "./"; import { Dataset } from "./";
import { ParserType } from "../parser";
class DatasetCollection { class DatasetCollection {
public static datasets: Dataset[] = [ public static datasets: Dataset[] = [
@@ -11,7 +12,7 @@ class DatasetCollection {
file: "open4goods-full-gtin-dataset.csv", file: "open4goods-full-gtin-dataset.csv",
dataType: NudgerData, dataType: NudgerData,
archiveType: ArchiveType.ZIP, archiveType: ArchiveType.ZIP,
datasetType: DatasetType.CSV, parserType: ParserType.CSV,
options: { options: {
parser: { parser: {
delimiter: ",", delimiter: ",",
@@ -25,7 +26,7 @@ class DatasetCollection {
file: "en.openfoodfacts.org.products.csv", file: "en.openfoodfacts.org.products.csv",
dataType: OpenFoodFactsData, dataType: OpenFoodFactsData,
archiveType: ArchiveType.GZIP, archiveType: ArchiveType.GZIP,
datasetType: DatasetType.CSV, parserType: ParserType.CSV,
options: { options: {
parser: { parser: {
delimiter: "\t", delimiter: "\t",
@@ -33,6 +34,15 @@ class DatasetCollection {
}, },
}, },
}), }),
new Dataset({
id: "world-cities",
source:
"https://raw.githubusercontent.com/datasets/world-cities/refs/heads/main/data/world-cities.csv",
file: "world-cities.csv",
dataType: WorldCitiesData,
archiveType: ArchiveType.NONE,
parserType: ParserType.CSV,
}),
]; ];
public static loadAll(): Promise<void[]> { public static loadAll(): Promise<void[]> {
-5
View File
@@ -1,5 +0,0 @@
enum DatasetType {
CSV = ".csv",
}
export default DatasetType;
-2
View File
@@ -1,4 +1,2 @@
export { default as DatasetType } from "./DatasetType";
export { default as Dataset } from "./Dataset"; export { default as Dataset } from "./Dataset";
export { default as DatasetCollection } from "./DatasetCollection"; export { default as DatasetCollection } from "./DatasetCollection";
-4
View File
@@ -1,6 +1,5 @@
import { Parser } from "./"; import { Parser } from "./";
import { Duplex } from "node:stream"; import { Duplex } from "node:stream";
// import csv from "csvtojson";
import * as csv from "fast-csv"; import * as csv from "fast-csv";
class CsvParser implements Parser { class CsvParser implements Parser {
@@ -13,9 +12,6 @@ class CsvParser implements Parser {
trim: true, trim: true,
...options, ...options,
}); });
// return csv({
// delimiter: "auto",
// });
} }
} }
+3 -3
View File
@@ -1,5 +1,5 @@
import { Parser, CsvParser } from "./"; import { Parser, CsvParser } from "./";
import { DatasetType } from "../dataset"; import { ParserType } from ".";
class ParserFactory { class ParserFactory {
/** /**
@@ -7,8 +7,8 @@ class ParserFactory {
* @param fileType The type of the dataset * @param fileType The type of the dataset
* @returns The parser corresponding to the dataset type * @returns The parser corresponding to the dataset type
*/ */
static getParser(fileType: DatasetType): Parser { static getParser(fileType: ParserType): Parser {
if (fileType === DatasetType.CSV) return CsvParser.instance; if (fileType === ParserType.CSV) return CsvParser.instance;
throw new Error("Unsupported file type"); throw new Error("Unsupported file type");
} }
} }
+5
View File
@@ -0,0 +1,5 @@
/**
 * File-format identifiers accepted by ParserFactory.getParser().
 * Each value is the file extension the corresponding parser handles.
 */
enum ParserType {
  CSV = ".csv",
}
export default ParserType;
+2
View File
@@ -1,3 +1,5 @@
export { default as ParserType } from "./ParserType";
export { default as ParserFactory } from "./ParserFactory"; export { default as ParserFactory } from "./ParserFactory";
export { default as Parser } from "./Parser"; export { default as Parser } from "./Parser";