From cde872ca550bb8b1a4e7d9b472ea1332cae58c3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=A0s?= <86352901+LucasVbr@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:07:23 +0200 Subject: [PATCH] =?UTF-8?q?[Draft]=20Stockage=20des=20donn=C3=A9es=20sur?= =?UTF-8?q?=20MariaDB=20avec=20TypeORM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker-compose.yaml | 2 +- package-lock.json | 10 ++ package.json | 1 + src/AppDataSource.ts | 9 +- src/routes/randomize.ts | 1 + src/services/CacheService.ts | 6 +- src/services/data/Data.ts | 12 +- src/services/data/NudgerData.ts | 37 ++++- src/services/data/OpenFoodFactsData.ts | 40 ++++- src/services/data/WorldCitiesData.ts | 39 ++++- src/services/data/index.ts | 2 +- src/services/dataset/Dataset.ts | 194 +++++++++++++--------- src/services/dataset/DatasetCollection.ts | 67 ++++---- src/services/dmn/DMN.ts | 47 +++++- 14 files changed, 323 insertions(+), 144 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 6076c46..1a134b4 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -3,7 +3,7 @@ version: "3.1" services: mariadb: image: mariadb:latest - restart: "always" + restart: "no" environment: MYSQL_ROOT_PASSWORD: root MYSQL_DATABASE: db diff --git a/package-lock.json b/package-lock.json index 46f91a4..0aae65d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,6 +15,7 @@ "dotenv": "^16.4.5", "express": "^4.21.0", "fast-csv": "^5.0.1", + "jsonschema": "^1.4.1", "mysql": "^2.18.1", "reflect-metadata": "^0.2.2", "tar-stream": "^3.1.7", @@ -1423,6 +1424,15 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jsonschema": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/jsonschema/-/jsonschema-1.4.1.tgz", + "integrity": "sha512-S6cATIPVv1z0IlxdN+zUk5EPjkGCdnhN4wVSBlvoUO1tOLJootbo9CquNJmbIh4yikWHiUedhRYrNPn1arpEmQ==", + "license": "MIT", + "engines": { + "node": "*" + } + }, "node_modules/lodash.escaperegexp": { "version": 
"4.1.2", "resolved": "https://registry.npmjs.org/lodash.escaperegexp/-/lodash.escaperegexp-4.1.2.tgz", diff --git a/package.json b/package.json index 52af80f..d38d5cf 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "dotenv": "^16.4.5", "express": "^4.21.0", "fast-csv": "^5.0.1", + "jsonschema": "^1.4.1", "mysql": "^2.18.1", "reflect-metadata": "^0.2.2", "tar-stream": "^3.1.7", diff --git a/src/AppDataSource.ts b/src/AppDataSource.ts index 64cb934..6d099b3 100644 --- a/src/AppDataSource.ts +++ b/src/AppDataSource.ts @@ -1,6 +1,11 @@ import "reflect-metadata"; import { DataSource } from "typeorm"; import { Log } from "./entity/Log"; +import { + NudgerData, + OpenFoodFactsData, + WorldCitiesData, +} from "./services/data"; export const AppDataSource = new DataSource({ type: "mariadb", @@ -10,8 +15,8 @@ export const AppDataSource = new DataSource({ password: "root", database: "db", synchronize: true, - logging: true, - entities: [Log], + logging: false, + entities: [Log, NudgerData, OpenFoodFactsData, WorldCitiesData], subscribers: [], migrations: [], }); diff --git a/src/routes/randomize.ts b/src/routes/randomize.ts index 108eb5c..54e7ca6 100644 --- a/src/routes/randomize.ts +++ b/src/routes/randomize.ts @@ -49,6 +49,7 @@ router.post("/randomize/:id", async (req: Request, res: Response) => { const dmn: Definitions = await DMN.parse(req.body); const schema = DMN.getSchema(dmn); + console.log(JSON.stringify(schema, null, 2)); const data: Data[] = await dataset.get(size, schema); diff --git a/src/services/CacheService.ts b/src/services/CacheService.ts index 5117f73..0f8bc94 100644 --- a/src/services/CacheService.ts +++ b/src/services/CacheService.ts @@ -22,8 +22,8 @@ class CacheService { } } -if (!existsSync(CacheService.CACHE_DIR)) { - mkdirSync(CacheService.CACHE_DIR); -} +// if (!existsSync(CacheService.CACHE_DIR)) { +// mkdirSync(CacheService.CACHE_DIR); +// } export default CacheService; diff --git a/src/services/data/Data.ts 
b/src/services/data/Data.ts index 366f4b2..87c1a66 100644 --- a/src/services/data/Data.ts +++ b/src/services/data/Data.ts @@ -1,9 +1,13 @@ interface Data { - input: any[]; - output: any[]; + id?: number; } -type DataConstructor<T extends Data = Data> = new (...args: any[]) => T; +class InvalidData extends Error { + constructor(message: string) { + super(message); + this.name = "Invalid data"; + } +} export default Data; -export { DataConstructor }; +export { InvalidData }; diff --git a/src/services/data/NudgerData.ts b/src/services/data/NudgerData.ts index c6ddd0d..c296365 100644 --- a/src/services/data/NudgerData.ts +++ b/src/services/data/NudgerData.ts @@ -1,4 +1,5 @@ -import { Data } from "./"; +import { Data, InvalidData } from "./"; +import { Column, Entity, PrimaryGeneratedColumn } from "typeorm"; type RawNudgerData = { code: string; // "3260014791012", @@ -15,13 +16,37 @@ type RawNudgerData = { url: string; // "" }; +@Entity() class NudgerData implements Data { - input: string[]; - output: string[]; + @PrimaryGeneratedColumn({ + type: "integer", + }) + id?: number; - constructor({ code, gs1_country }: RawNudgerData) { - this.input = [code]; - this.output = [gs1_country]; + @Column() + barcode_ean_13: string; + + @Column() + country: string; + + constructor(code: string, gs1_country: string) { + this.barcode_ean_13 = code; + this.country = gs1_country; + } + + static fromRaw({ code, gs1_country }: RawNudgerData): NudgerData { + if (!code || !gs1_country || code.length !== 13) { + throw new InvalidData("Invalid data"); + } + + return new NudgerData(code, gs1_country); + } + + asData(nudgerData: NudgerData): any { + return { + "Barcode (EAN 13)": nudgerData.barcode_ean_13, + Country: nudgerData.country, + }; } } diff --git a/src/services/data/OpenFoodFactsData.ts b/src/services/data/OpenFoodFactsData.ts index 1562d14..f23cd16 100644 --- a/src/services/data/OpenFoodFactsData.ts +++ b/src/services/data/OpenFoodFactsData.ts @@ -1,17 +1,45 @@ -import { Data } from "./"; +import { Data, 
InvalidData } from "./"; +import { Column, Entity, PrimaryGeneratedColumn } from "typeorm"; type RawOpenFoodFactsData = { code: string; countries_en: string; }; +@Entity() class OpenFoodFactsData implements Data { - input: string[] = []; - output: string[] = []; + @PrimaryGeneratedColumn({ + type: "integer", + }) + id?: number; - constructor({ code, countries_en }: RawOpenFoodFactsData) { - this.input = [code]; - this.output = [countries_en]; + @Column() + barcode_ean_13: string; + + @Column() + country: string; + + constructor(code: string, gs1_country: string) { + this.barcode_ean_13 = code; + this.country = gs1_country; + } + + static fromRaw({ + code, + countries_en, + }: RawOpenFoodFactsData): OpenFoodFactsData { + if (!code || !countries_en || code.length !== 13) { + throw new InvalidData("Invalid data"); + } + + return new OpenFoodFactsData(code, countries_en); + } + + asData(openData: OpenFoodFactsData): any { + return { + "Barcode (EAN 13)": openData.barcode_ean_13, + Country: openData.country, + }; } } diff --git a/src/services/data/WorldCitiesData.ts b/src/services/data/WorldCitiesData.ts index 630a3d5..528fdb6 100644 --- a/src/services/data/WorldCitiesData.ts +++ b/src/services/data/WorldCitiesData.ts @@ -1,19 +1,44 @@ -import { Data } from "./"; +import { Data, InvalidData } from "./"; +import { Column, Entity, PrimaryGeneratedColumn } from "typeorm"; -type RawSmolaData = { +type RawWorldCitiesData = { name: string; country: string; subcountry: string; geonameid: string; }; +@Entity() class WorldCitiesData implements Data { - input: string[]; - output: string[]; + @PrimaryGeneratedColumn({ + type: "integer", + }) + id?: number; - constructor({ geonameid, country }: RawSmolaData) { - this.input = [geonameid]; - this.output = [country]; + @Column() + geoname_id: string; + + @Column() + country: string; + + constructor(geonameId: string, country: string) { + this.geoname_id = geonameId; + this.country = country; + } + + static fromRaw({ geonameid, 
country }: RawWorldCitiesData): WorldCitiesData { + if (!geonameid || !country || geonameid.length !== 6) { + throw new InvalidData("Invalid data"); + } + + return new WorldCitiesData(geonameid, country); + } + + asData(worldCitiesData: WorldCitiesData): any { + return { + "Geoname ID": worldCitiesData.geoname_id, + Country: worldCitiesData.country, + }; } } diff --git a/src/services/data/index.ts b/src/services/data/index.ts index 044db4e..07b982e 100644 --- a/src/services/data/index.ts +++ b/src/services/data/index.ts @@ -1,4 +1,4 @@ -export { default as Data, DataConstructor } from "./Data"; +export { default as Data, InvalidData } from "./Data"; export { default as NudgerData } from "./NudgerData"; export { default as OpenFoodFactsData } from "./OpenFoodFactsData"; diff --git a/src/services/dataset/Dataset.ts b/src/services/dataset/Dataset.ts index 8a9df20..e0f991b 100644 --- a/src/services/dataset/Dataset.ts +++ b/src/services/dataset/Dataset.ts @@ -1,18 +1,19 @@ -import { pipeline, Transform } from "node:stream"; +import { pipeline, Transform, Writable } from "node:stream"; import { promisify } from "node:util"; -import * as fs from "node:fs"; -import * as readline from "node:readline"; +import { Validator } from "jsonschema"; -import CacheService from "../CacheService"; import FileService from "../FileService"; import { ArchiveFactory, ArchiveType } from "../archive"; import { ParserFactory, ParserType } from "../parser"; -import { Data, DataConstructor } from "../data"; +import { Data, InvalidData} from "../data"; +import { AppDataSource } from "../../AppDataSource"; +import { EntityManager, EntityTarget, Repository } from "typeorm"; type DatasetParams = { id: string; - dataType: DataConstructor; + dataConstructor: (params: any) => Data; + dataType: Data; source: string; file: string; archiveType: ArchiveType; @@ -27,14 +28,14 @@ type DatasetOptions = { /** * Represents a dataset that can be loaded and queried */ -class Dataset { +class Dataset { readonly 
id: string; readonly source: string; readonly file: string; readonly archiveType: ArchiveType; readonly parserType: ParserType; - readonly cachePath: string; - private dataType: DataConstructor; + readonly dataConstructor: (params: any) => Data; + readonly dataType: Data; private options?: DatasetOptions; /** @@ -44,27 +45,28 @@ class Dataset { * @param file - The name of the file in the archive * @param dataType - The constructor of the data class * @param archiveType - The type of the archive - * @param datasetType - The type of the dataset + * @param dataConstructor - Factory that builds a Data entity from one raw parsed row + * @param parserType - The type of the parser * @param options - Additional options for the dataset */ constructor({ id, source, file, + dataConstructor, dataType, archiveType, parserType, options, }: DatasetParams) { this.id = id; - this.dataType = dataType; + this.dataConstructor = dataConstructor; this.source = source; this.file = file; + this.dataType = dataType; this.archiveType = archiveType; this.parserType = parserType; this.options = options; - - this.cachePath = CacheService.getCachePath(this.source, ".json"); } /** @@ -73,10 +75,12 @@ class Dataset { * @throws {Error} - If the dataset cannot be loaded */ public async load(): Promise<void> { - if (CacheService.isCached(this.source, ".json")) { - console.log(`Already cached: ${this.source}`); - return; - } + // const repository: Repository = AppDataSource.getRepository(Data); + + // if ((await repository.count()) > 0) { + // console.log(`Already cached: ${this.source}`); + // return; + // } const archive = ArchiveFactory.getArchive(this.archiveType); const parser = ParserFactory.getParser(this.parserType); const pipelineAsync = promisify(pipeline); console.log(`Download: ${this.source}`); - await pipelineAsync( - await FileService.getFileStream(this.source), - archive.extract(this.file), - parser.parse(this.options?.parser), - Dataset.transformToData(this.dataType), - 
FileService.createWriteStream(this.cachePath) - ) - .then(() => { - console.log(`Loaded: ${this.source}`); - }) - .catch((err) => { - console.error(`Failed to load dataset: ${this.source}`); - FileService.deleteFile(this.cachePath); - throw err; - }); + + // Start transaction + await AppDataSource.manager.transaction(async (manager) => { + await pipelineAsync( + await FileService.getFileStream(this.source), + archive.extract(this.file), + parser.parse(this.options?.parser), + Dataset.transformToData(this.dataConstructor, manager), + new Writable({ + objectMode: true, + write(chunk, _, callback) { + callback(); + }, + }) + ) + .then(() => { + console.log(`Loaded: ${this.source}`); + }) + .catch((err) => { + console.error(`Failed to load dataset: ${this.source}`); + throw err; + }); + }); } - private static transformToData(dataType: DataConstructor): Transform { + private static transformToData( + dataType: (params: any) => Data, + manager: EntityManager + ): Transform { return new Transform({ objectMode: true, - transform(chunk: object, _, callback) { - const data: Data = new dataType(chunk); - this.push(JSON.stringify(data) + "\n"); - - callback(null, JSON.stringify(data) + "\n"); + async transform(chunk: object, _, callback) { + try { + const data: Data = dataType(chunk); + await manager.save(data); + callback(null, JSON.stringify(data) + "\n"); + } catch (err: any) { + if (err instanceof InvalidData) { + callback(null, ""); + } else callback(err); + } }, }); } @@ -118,51 +138,75 @@ class Dataset { * @param length - The number of data entries to get (default: 10) * @param schema - Schema of the expected data returned */ - public get( - length: number = 10, - schema: { input: string[] | undefined; output: string[] | undefined } - ): Promise<any[]> { + public async get(length: number = 10, schema: {}): Promise<Data[]> { + const dataRepository = AppDataSource.manager.getRepository( + this.dataType as EntityTarget<Data> + ); + + const datas = await dataRepository + 
.createQueryBuilder("data") + .orderBy("RAND()") // Fonction RAND() pour randomiser l'ordre + .limit(length) // Limiter le nombre de résultats + .getMany(); + return new Promise((resolve, reject) => { let count: number = 0; - const results: any[] = []; + const results: Data[] = []; + const validator = new Validator(); - const stream = fs.createReadStream(this.cachePath, { encoding: "utf8" }); - const rl = readline.createInterface({ - input: stream, - crlfDelay: Infinity, - }); + datas.forEach((data) => { + const randomizedData = (data as any).asData(data); + // NOTE(review): asData exposes entity columns under their DMN-facing names — confirm they match the decision table's clause names - rl.on("line", (line) => { - if (count < length) { - const data: Data = JSON.parse(line) as Data; - - // Create an object with the input and output values according to the schema - const obj: any = {}; - schema.input?.forEach((input: string, index: number) => { - obj[input] = data.input[index]; - }); - schema.output?.forEach((output, index) => { - obj[output] = data.output[index]; - }); - - // Add the object to the results - results.push(obj); + if (validator.validate(randomizedData, schema).valid) { + results.push(randomizedData); count++; - } else { - rl.close(); // Fermer le flux si on a atteint les n objets } }); - - // Quand le flux est terminé ou a été fermé. 
- rl.on("close", () => { - resolve(results); // Renvoie les n objets lus - }); - - // Gérer les erreurs du flux de lecture - rl.on("error", (err) => { - reject(err); - }); + return resolve(results); }); + // // + // // const stream = fs.createReadStream(this.cachePath, { encoding: "utf8" }); + // // const rl = readline.createInterface({ + // // input: stream, + // // crlfDelay: Infinity, + // // }); + // // + // // + // // rl.on("line", (line) => { + // // if (count < length) { + // // const data: Data = JSON.parse(line) as Data; + // // if (validator.validate(data, schema)) { + // // results.push(data); + // // count++; + // // } + // // + // // // // Pour chaque objet, récupérer l'objet et vérifier que le schéma est valide + // // + // // // schema.input?.forEach((input: string, index: number) => { + // // // obj[input] = data.input[index]; + // // // }); + // // // schema.output?.forEach((output, index) => { + // // // obj[output] = data.output[index]; + // // // }); + // // + // // // // Add the object to the results + // // // count++; + // // } else { + // // rl.close(); // Fermer le flux si on a atteint les n objets + // // } + // // }); + // // + // // // Quand le flux est terminé ou a été fermé. 
+ // // rl.on("close", () => { + // // resolve(results); // Renvoie les n objets lus + // // }); + // // + // // // Gérer les erreurs du flux de lecture + // // rl.on("error", (err) => { + // // reject(err); + // // }); + // // }); } } diff --git a/src/services/dataset/DatasetCollection.ts b/src/services/dataset/DatasetCollection.ts index 0f8a4b0..1fc16b6 100644 --- a/src/services/dataset/DatasetCollection.ts +++ b/src/services/dataset/DatasetCollection.ts @@ -1,44 +1,47 @@ -import { NudgerData, OpenFoodFactsData, WorldCitiesData } from "../data"; +import { Data, NudgerData, OpenFoodFactsData, WorldCitiesData } from "../data"; import { ArchiveType } from "../archive"; import { Dataset } from "./"; import { ParserType } from "../parser"; class DatasetCollection { - public static datasets: Dataset[] = [ - new Dataset({ - id: "nudger", - source: - "https://files.opendatarchives.fr/data.cquest.org/open4goods/gtin-open-data.zip", - file: "open4goods-full-gtin-dataset.csv", - dataType: NudgerData, - archiveType: ArchiveType.ZIP, - parserType: ParserType.CSV, - options: { - parser: { - delimiter: ",", - }, - }, - }), - new Dataset({ - id: "openfoodfacts", - source: - "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz", - file: "en.openfoodfacts.org.products.csv", - dataType: OpenFoodFactsData, - archiveType: ArchiveType.GZIP, - parserType: ParserType.CSV, - options: { - parser: { - delimiter: "\t", - quote: null, - }, - }, - }), - new Dataset({ + public static datasets: Dataset[] = [ + // new Dataset({ + // id: "nudger", + // source: + // "https://files.opendatarchives.fr/data.cquest.org/open4goods/gtin-open-data.zip", + // file: "open4goods-full-gtin-dataset.csv", + // dataConstructor: NudgerData.fromRaw, + // dataType: NudgerData, + // archiveType: ArchiveType.ZIP, + // parserType: ParserType.CSV, + // options: { + // parser: { + // delimiter: ",", + // }, + // }, + // }), + // new Dataset({ + // id: "openfoodfacts", + // source: + // 
"https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz", + // file: "en.openfoodfacts.org.products.csv", + // dataConstructor: OpenFoodFactsData.fromRaw, + // dataType: OpenFoodFactsData, + // archiveType: ArchiveType.GZIP, + // parserType: ParserType.CSV, + // options: { + // parser: { + // delimiter: "\t", + // quote: null, + // }, + // }, + // }), + new Dataset({ id: "world-cities", source: "https://raw.githubusercontent.com/datasets/world-cities/refs/heads/main/data/world-cities.csv", file: "world-cities.csv", + dataConstructor: WorldCitiesData.fromRaw, dataType: WorldCitiesData, archiveType: ArchiveType.NONE, parserType: ParserType.CSV, diff --git a/src/services/dmn/DMN.ts b/src/services/dmn/DMN.ts index f6d07b2..dfd482b 100644 --- a/src/services/dmn/DMN.ts +++ b/src/services/dmn/DMN.ts @@ -6,6 +6,8 @@ import { Name_of_DMN_InputClause, Name_of_DMN_OutputClause, Definitions, + InputClause, + OutputClause, } from "./interfaces"; export class DMN { @@ -17,18 +19,49 @@ export class DMN { } public static getSchema(dmnDefinitions: Definitions) { - const descisions: Decision[] = dmnDefinitions.drgElement.filter((element) => + const { inputs, outputs } = this.getInputOutput(dmnDefinitions); + const properties = this.getProperties(inputs || [], outputs || []); + + return { + type: "object", + properties, + required: Object.keys(properties), + }; + } + + private static getInputOutput(dmnDefinitions: Definitions) { + const decisions: Decision[] = dmnDefinitions.drgElement.filter((element) => Is_DMN_Decision(element) ); - const { input, output } = descisions + const { input: inputs, output: outputs } = decisions .map((decision) => decision.decisionLogic) .filter((decisionLogic) => Is_DMN_DecisionTable(decisionLogic))[0]; - // TODO generate json schema + return { inputs, outputs }; + } - return { - input: input?.map((input) => Name_of_DMN_InputClause(input)), - output: output?.map((output) => Name_of_DMN_OutputClause(output)), - }; + private static 
getProperties(inputs: InputClause[], outputs: OutputClause[]) { + let properties = {}; + + inputs.forEach((input) => { + const name = Name_of_DMN_InputClause(input) as string; + const type = input.typeRef || "string"; + // @ts-ignore + properties[name] = { + type, + }; + }); + + outputs.forEach((output) => { + const name = Name_of_DMN_OutputClause(output) as string; + const type = output.typeRef || "string"; + + // @ts-ignore + properties[name] = { + type, + }; + }); + + return properties; } }