feat: Use MongoDB to save the datalake

This commit is contained in:
Lucàs
2024-11-11 23:09:59 +01:00
parent cde872ca55
commit 6975765e18
36 changed files with 862 additions and 517 deletions
+3
View File
@@ -7,6 +7,9 @@
cache/ cache/
node_modules/ node_modules/
# Docker
docker/
# Build files # Build files
build/ build/
+36 -1
View File
@@ -1,11 +1,46 @@
# Credentials: MongoDB root user is root/root; MariaDB root password is root and the application user is user/example
version: "3.1" version: "3.1"
services: services:
mongo:
image: mongo
container_name: data-db
restart: "no"
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: root
MONGO_INITDB_DATABASE: db
volumes:
- ./docker/mongodb/db:/data/db
- ./docker/mongodb/logs:/var/log/mongodb
command:
["mongod", "--logpath", "/var/log/mongodb/mongodb.log", "--logappend"]
deploy:
resources:
limits:
memory: 4g
cpus: "2.0"
reservations:
memory: 2g
ports:
- "27017:27017"
mariadb: mariadb:
image: mariadb:latest image: mariadb
container_name: log-db
restart: "no" restart: "no"
environment: environment:
MYSQL_ROOT_PASSWORD: root MYSQL_ROOT_PASSWORD: root
MYSQL_DATABASE: db MYSQL_DATABASE: db
MYSQL_USER: user
MYSQL_PASSWORD: example
ports: ports:
- "3306:3306" - "3306:3306"
volumes:
- ./docker/mariadb/db:/var/lib/mysql
volumes:
mongodb_data:
driver: local
mariadb_data:
driver: local
+15
View File
@@ -0,0 +1,15 @@
# mongod.conf
storage:
dbPath: /data/db
wiredTiger:
engineConfig:
cacheSizeGB: 2 # Adjust to the available memory (e.g. 2 GB)
systemLog:
destination: file
logAppend: true
path: /data/db/mongodb.log
operationProfiling:
mode: slowOp
slowOpThresholdMs: 1000 # Treat an operation as slow when it takes longer than 1 second
net:
bindIp: 0.0.0.0
+412
View File
@@ -16,6 +16,7 @@
"express": "^4.21.0", "express": "^4.21.0",
"fast-csv": "^5.0.1", "fast-csv": "^5.0.1",
"jsonschema": "^1.4.1", "jsonschema": "^1.4.1",
"mongoose": "^8.8.1",
"mysql": "^2.18.1", "mysql": "^2.18.1",
"reflect-metadata": "^0.2.2", "reflect-metadata": "^0.2.2",
"tar-stream": "^3.1.7", "tar-stream": "^3.1.7",
@@ -24,6 +25,7 @@
}, },
"devDependencies": { "devDependencies": {
"@types/express": "^4.17.21", "@types/express": "^4.17.21",
"@types/mongoose": "^5.11.96",
"@types/node": "^22.5.5", "@types/node": "^22.5.5",
"@types/tar-stream": "^3.1.3", "@types/tar-stream": "^3.1.3",
"@types/unzipper": "^0.10.10", "@types/unzipper": "^0.10.10",
@@ -117,6 +119,15 @@
"@jridgewell/sourcemap-codec": "^1.4.10" "@jridgewell/sourcemap-codec": "^1.4.10"
} }
}, },
"node_modules/@mongodb-js/saslprep": {
"version": "1.1.9",
"resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.1.9.tgz",
"integrity": "sha512-tVkljjeEaAhCqTzajSdgbQ6gE6f3oneVwa3iXR6csiEwXXOFsiC6Uh9iAjAhXPtqa/XMDHWjjeNH/77m/Yq2dw==",
"license": "MIT",
"dependencies": {
"sparse-bitfield": "^3.0.3"
}
},
"node_modules/@pkgjs/parseargs": { "node_modules/@pkgjs/parseargs": {
"version": "0.11.0", "version": "0.11.0",
"resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
@@ -222,6 +233,16 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/@types/mongoose": {
"version": "5.11.96",
"resolved": "https://registry.npmjs.org/@types/mongoose/-/mongoose-5.11.96.tgz",
"integrity": "sha512-keiY22ljJtXyM7osgScmZOHV6eL5VFUD5tQumlu+hjS++HND5nM8jNEdj5CSWfKIJpVwQfPuwQ2SfBqUnCAVRw==",
"dev": true,
"license": "MIT",
"dependencies": {
"mongoose": "*"
}
},
"node_modules/@types/node": { "node_modules/@types/node": {
"version": "22.5.5", "version": "22.5.5",
"resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.5.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.5.tgz",
@@ -289,6 +310,24 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/webidl-conversions": {
"version": "7.0.3",
"resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz",
"integrity": "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==",
"license": "MIT"
},
"node_modules/@types/whatwg-url": {
"version": "8.2.2",
"resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-8.2.2.tgz",
"integrity": "sha512-FtQu10RWgn3D9U4aazdwIE2yzphmTJREDqNdODHrbrZmmMqI0vMheC/6NE/J1Yveaj8H+ela+YwWTjq5PGmuhA==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"@types/node": "*",
"@types/webidl-conversions": "*"
}
},
"node_modules/accepts": { "node_modules/accepts": {
"version": "1.3.8", "version": "1.3.8",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz",
@@ -487,6 +526,15 @@
"balanced-match": "^1.0.0" "balanced-match": "^1.0.0"
} }
}, },
"node_modules/bson": {
"version": "6.9.0",
"resolved": "https://registry.npmjs.org/bson/-/bson-6.9.0.tgz",
"integrity": "sha512-X9hJeyeM0//Fus+0pc5dSUMhhrrmWwQUtdavaQeF3Ta6m69matZkGWV/MrBcnwUeLC8W9kwwc2hfkZgUuCX3Ig==",
"license": "Apache-2.0",
"engines": {
"node": ">=16.20.1"
}
},
"node_modules/buffer": { "node_modules/buffer": {
"version": "6.0.3", "version": "6.0.3",
"resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz", "resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz",
@@ -1367,6 +1415,21 @@
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
"license": "ISC" "license": "ISC"
}, },
"node_modules/ip-address": {
"version": "9.0.5",
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-9.0.5.tgz",
"integrity": "sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"jsbn": "1.1.0",
"sprintf-js": "^1.1.3"
},
"engines": {
"node": ">= 12"
}
},
"node_modules/ipaddr.js": { "node_modules/ipaddr.js": {
"version": "1.9.1", "version": "1.9.1",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
@@ -1412,6 +1475,14 @@
"@pkgjs/parseargs": "^0.11.0" "@pkgjs/parseargs": "^0.11.0"
} }
}, },
"node_modules/jsbn": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-1.1.0.tgz",
"integrity": "sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==",
"license": "MIT",
"optional": true,
"peer": true
},
"node_modules/jsonfile": { "node_modules/jsonfile": {
"version": "6.1.0", "version": "6.1.0",
"resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz", "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz",
@@ -1433,6 +1504,15 @@
"node": "*" "node": "*"
} }
}, },
"node_modules/kareem": {
"version": "2.6.3",
"resolved": "https://registry.npmjs.org/kareem/-/kareem-2.6.3.tgz",
"integrity": "sha512-C3iHfuGUXK2u8/ipq9LfjFfXFxAZMQJJq7vLS45r3D9Y2xQ/m4S8zaR4zMLFWh9AsNPXmcFfUDhTEO8UIC/V6Q==",
"license": "Apache-2.0",
"engines": {
"node": ">=12.0.0"
}
},
"node_modules/lodash.escaperegexp": { "node_modules/lodash.escaperegexp": {
"version": "4.1.2", "version": "4.1.2",
"resolved": "https://registry.npmjs.org/lodash.escaperegexp/-/lodash.escaperegexp-4.1.2.tgz", "resolved": "https://registry.npmjs.org/lodash.escaperegexp/-/lodash.escaperegexp-4.1.2.tgz",
@@ -1503,6 +1583,12 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/memory-pager": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz",
"integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==",
"license": "MIT"
},
"node_modules/merge-descriptors": { "node_modules/merge-descriptors": {
"version": "1.0.3", "version": "1.0.3",
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz",
@@ -1619,6 +1705,234 @@
"saxen": "^8.1.2" "saxen": "^8.1.2"
} }
}, },
"node_modules/mongodb": {
"version": "5.9.2",
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
"integrity": "sha512-H60HecKO4Bc+7dhOv4sJlgvenK4fQNqqUIlXxZYQNbfEWSALGAwGoyJd/0Qwk4TttFXUOHJ2ZJQe/52ScaUwtQ==",
"license": "Apache-2.0",
"optional": true,
"peer": true,
"dependencies": {
"bson": "^5.5.0",
"mongodb-connection-string-url": "^2.6.0",
"socks": "^2.7.1"
},
"engines": {
"node": ">=14.20.1"
},
"optionalDependencies": {
"@mongodb-js/saslprep": "^1.1.0"
},
"peerDependencies": {
"@aws-sdk/credential-providers": "^3.188.0",
"@mongodb-js/zstd": "^1.0.0",
"kerberos": "^1.0.0 || ^2.0.0",
"mongodb-client-encryption": ">=2.3.0 <3",
"snappy": "^7.2.2"
},
"peerDependenciesMeta": {
"@aws-sdk/credential-providers": {
"optional": true
},
"@mongodb-js/zstd": {
"optional": true
},
"kerberos": {
"optional": true
},
"mongodb-client-encryption": {
"optional": true
},
"snappy": {
"optional": true
}
}
},
"node_modules/mongodb-connection-string-url": {
"version": "2.6.0",
"resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-2.6.0.tgz",
"integrity": "sha512-WvTZlI9ab0QYtTYnuMLgobULWhokRjtC7db9LtcVfJ+Hsnyr5eo6ZtNAt3Ly24XZScGMelOcGtm7lSn0332tPQ==",
"license": "Apache-2.0",
"optional": true,
"peer": true,
"dependencies": {
"@types/whatwg-url": "^8.2.1",
"whatwg-url": "^11.0.0"
}
},
"node_modules/mongodb/node_modules/bson": {
"version": "5.5.1",
"resolved": "https://registry.npmjs.org/bson/-/bson-5.5.1.tgz",
"integrity": "sha512-ix0EwukN2EpC0SRWIj/7B5+A6uQMQy6KMREI9qQqvgpkV2frH63T0UDVd1SYedL6dNCmDBYB3QtXi4ISk9YT+g==",
"license": "Apache-2.0",
"optional": true,
"peer": true,
"engines": {
"node": ">=14.20.1"
}
},
"node_modules/mongoose": {
"version": "8.8.1",
"resolved": "https://registry.npmjs.org/mongoose/-/mongoose-8.8.1.tgz",
"integrity": "sha512-l7DgeY1szT98+EKU8GYnga5WnyatAu+kOQ2VlVX1Mxif6A0Umt0YkSiksCiyGxzx8SPhGe9a53ND1GD4yVDrPA==",
"license": "MIT",
"dependencies": {
"bson": "^6.7.0",
"kareem": "2.6.3",
"mongodb": "~6.10.0",
"mpath": "0.9.0",
"mquery": "5.0.0",
"ms": "2.1.3",
"sift": "17.1.3"
},
"engines": {
"node": ">=16.20.1"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/mongoose"
}
},
"node_modules/mongoose/node_modules/@types/whatwg-url": {
"version": "11.0.5",
"resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz",
"integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==",
"license": "MIT",
"dependencies": {
"@types/webidl-conversions": "*"
}
},
"node_modules/mongoose/node_modules/mongodb": {
"version": "6.10.0",
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.10.0.tgz",
"integrity": "sha512-gP9vduuYWb9ZkDM546M+MP2qKVk5ZG2wPF63OvSRuUbqCR+11ZCAE1mOfllhlAG0wcoJY5yDL/rV3OmYEwXIzg==",
"license": "Apache-2.0",
"dependencies": {
"@mongodb-js/saslprep": "^1.1.5",
"bson": "^6.7.0",
"mongodb-connection-string-url": "^3.0.0"
},
"engines": {
"node": ">=16.20.1"
},
"peerDependencies": {
"@aws-sdk/credential-providers": "^3.188.0",
"@mongodb-js/zstd": "^1.1.0",
"gcp-metadata": "^5.2.0",
"kerberos": "^2.0.1",
"mongodb-client-encryption": ">=6.0.0 <7",
"snappy": "^7.2.2",
"socks": "^2.7.1"
},
"peerDependenciesMeta": {
"@aws-sdk/credential-providers": {
"optional": true
},
"@mongodb-js/zstd": {
"optional": true
},
"gcp-metadata": {
"optional": true
},
"kerberos": {
"optional": true
},
"mongodb-client-encryption": {
"optional": true
},
"snappy": {
"optional": true
},
"socks": {
"optional": true
}
}
},
"node_modules/mongoose/node_modules/mongodb-connection-string-url": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.1.tgz",
"integrity": "sha512-XqMGwRX0Lgn05TDB4PyG2h2kKO/FfWJyCzYQbIhXUxz7ETt0I/FqHjUeqj37irJ+Dl1ZtU82uYyj14u2XsZKfg==",
"license": "Apache-2.0",
"dependencies": {
"@types/whatwg-url": "^11.0.2",
"whatwg-url": "^13.0.0"
}
},
"node_modules/mongoose/node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"license": "MIT"
},
"node_modules/mongoose/node_modules/tr46": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz",
"integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==",
"license": "MIT",
"dependencies": {
"punycode": "^2.3.0"
},
"engines": {
"node": ">=14"
}
},
"node_modules/mongoose/node_modules/whatwg-url": {
"version": "13.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-13.0.0.tgz",
"integrity": "sha512-9WWbymnqj57+XEuqADHrCJ2eSXzn8WXIW/YSGaZtb2WKAInQ6CHfaUUcTyyver0p8BDg5StLQq8h1vtZuwmOig==",
"license": "MIT",
"dependencies": {
"tr46": "^4.1.1",
"webidl-conversions": "^7.0.0"
},
"engines": {
"node": ">=16"
}
},
"node_modules/mpath": {
"version": "0.9.0",
"resolved": "https://registry.npmjs.org/mpath/-/mpath-0.9.0.tgz",
"integrity": "sha512-ikJRQTk8hw5DEoFVxHG1Gn9T/xcjtdnOKIU1JTmGjZZlg9LST2mBLmcX3/ICIbgJydT2GOc15RnNy5mHmzfSew==",
"license": "MIT",
"engines": {
"node": ">=4.0.0"
}
},
"node_modules/mquery": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/mquery/-/mquery-5.0.0.tgz",
"integrity": "sha512-iQMncpmEK8R8ncT8HJGsGc9Dsp8xcgYMVSbs5jgnm1lFHTZqMJTUWTDx1LBO8+mK3tPNZWFLBghQEIOULSTHZg==",
"license": "MIT",
"dependencies": {
"debug": "4.x"
},
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/mquery/node_modules/debug": {
"version": "4.3.7",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.7.tgz",
"integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==",
"license": "MIT",
"dependencies": {
"ms": "^2.1.3"
},
"engines": {
"node": ">=6.0"
},
"peerDependenciesMeta": {
"supports-color": {
"optional": true
}
}
},
"node_modules/mquery/node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"license": "MIT"
},
"node_modules/ms": { "node_modules/ms": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
@@ -1828,6 +2142,15 @@
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/punycode": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
"integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/qs": { "node_modules/qs": {
"version": "6.13.0", "version": "6.13.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz",
@@ -2070,6 +2393,12 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/sift": {
"version": "17.1.3",
"resolved": "https://registry.npmjs.org/sift/-/sift-17.1.3.tgz",
"integrity": "sha512-Rtlj66/b0ICeFzYTuNvX/EF1igRbbnGSvEyT79McoZa/DeGhMyC5pWKOEsZKnpkqtSeovd5FL/bjHWC3CIIvCQ==",
"license": "MIT"
},
"node_modules/signal-exit": { "node_modules/signal-exit": {
"version": "4.1.0", "version": "4.1.0",
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
@@ -2082,6 +2411,51 @@
"url": "https://github.com/sponsors/isaacs" "url": "https://github.com/sponsors/isaacs"
} }
}, },
"node_modules/smart-buffer": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
"integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==",
"license": "MIT",
"optional": true,
"peer": true,
"engines": {
"node": ">= 6.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/socks": {
"version": "2.8.3",
"resolved": "https://registry.npmjs.org/socks/-/socks-2.8.3.tgz",
"integrity": "sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"ip-address": "^9.0.5",
"smart-buffer": "^4.2.0"
},
"engines": {
"node": ">= 10.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/sparse-bitfield": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz",
"integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==",
"license": "MIT",
"dependencies": {
"memory-pager": "^1.0.2"
}
},
"node_modules/sprintf-js": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz",
"integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==",
"license": "BSD-3-Clause",
"optional": true,
"peer": true
},
"node_modules/sqlstring": { "node_modules/sqlstring": {
"version": "2.3.1", "version": "2.3.1",
"resolved": "https://registry.npmjs.org/sqlstring/-/sqlstring-2.3.1.tgz", "resolved": "https://registry.npmjs.org/sqlstring/-/sqlstring-2.3.1.tgz",
@@ -2287,6 +2661,20 @@
"node": ">=0.6" "node": ">=0.6"
} }
}, },
"node_modules/tr46": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz",
"integrity": "sha512-l7FvfAHlcmulp8kr+flpQZmVwtu7nfRV7NZujtN0OqES8EL4O4e0qqzL0DC5gAvx/ZC/9lk6rhcUwYvkBnBnYA==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"punycode": "^2.1.1"
},
"engines": {
"node": ">=12"
}
},
"node_modules/ts-node": { "node_modules/ts-node": {
"version": "10.9.2", "version": "10.9.2",
"resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz", "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz",
@@ -2575,6 +2963,30 @@
"node": ">= 0.8" "node": ">= 0.8"
} }
}, },
"node_modules/webidl-conversions": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
"integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=12"
}
},
"node_modules/whatwg-url": {
"version": "11.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz",
"integrity": "sha512-RKT8HExMpoYx4igMiVMY83lN6UeITKJlBQ+vR/8ZJ8OCdSiN3RwCq+9gH0+Xzj0+5IrM6i4j/6LuvzbZIQgEcQ==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"tr46": "^3.0.0",
"webidl-conversions": "^7.0.0"
},
"engines": {
"node": ">=12"
}
},
"node_modules/which": { "node_modules/which": {
"version": "2.0.2", "version": "2.0.2",
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
+2
View File
@@ -22,6 +22,7 @@
"express": "^4.21.0", "express": "^4.21.0",
"fast-csv": "^5.0.1", "fast-csv": "^5.0.1",
"jsonschema": "^1.4.1", "jsonschema": "^1.4.1",
"mongoose": "^8.8.1",
"mysql": "^2.18.1", "mysql": "^2.18.1",
"reflect-metadata": "^0.2.2", "reflect-metadata": "^0.2.2",
"tar-stream": "^3.1.7", "tar-stream": "^3.1.7",
@@ -30,6 +31,7 @@
}, },
"devDependencies": { "devDependencies": {
"@types/express": "^4.17.21", "@types/express": "^4.17.21",
"@types/mongoose": "^5.11.96",
"@types/node": "^22.5.5", "@types/node": "^22.5.5",
"@types/tar-stream": "^3.1.3", "@types/tar-stream": "^3.1.3",
"@types/unzipper": "^0.10.10", "@types/unzipper": "^0.10.10",
+2 -2
View File
@@ -1,7 +1,7 @@
import express from "express"; import express from "express";
import routes from "./routes"; import routes from "./routes";
import { createServer } from "node:http"; import { createServer } from "node:http";
import { logger, xmlBodyParser } from "./middlewares"; import { bodyToSchema, logger, xmlBodyParser } from "./middlewares";
export default class Server { export default class Server {
private readonly app: express.Application; private readonly app: express.Application;
@@ -9,7 +9,7 @@ export default class Server {
constructor() { constructor() {
this.app = express(); this.app = express();
this.app.use(express.json(), xmlBodyParser, logger, routes); this.app.use(express.json(), xmlBodyParser, bodyToSchema, logger, routes);
} }
public start() { public start() {
@@ -1,13 +1,8 @@
import "reflect-metadata"; import "reflect-metadata";
import { DataSource } from "typeorm"; import { DataSource } from "typeorm";
import { Log } from "./entity/Log"; import { Log } from "./entity/Log";
import {
NudgerData,
OpenFoodFactsData,
WorldCitiesData,
} from "./services/data";
export const AppDataSource = new DataSource({ export const TypeOrmDataSource = new DataSource({
type: "mariadb", type: "mariadb",
host: "localhost", host: "localhost",
port: 3306, port: 3306,
@@ -16,7 +11,7 @@ export const AppDataSource = new DataSource({
database: "db", database: "db",
synchronize: true, synchronize: true,
logging: false, logging: false,
entities: [Log, NudgerData, OpenFoodFactsData, WorldCitiesData], entities: [Log],
subscribers: [], subscribers: [],
migrations: [], migrations: [],
}); });
+8 -2
View File
@@ -1,11 +1,17 @@
import dotenv from "dotenv"; import dotenv from "dotenv";
import Server from "./Server"; import Server from "./Server";
import { DatasetCollection } from "./services/dataset"; import { DatasetCollection } from "./services/dataset";
import { AppDataSource } from "./AppDataSource"; import { TypeOrmDataSource } from "./TypeOrmDataSource";
dotenv.config(); dotenv.config();
AppDataSource.initialize() // 1. initialize database connexions
// 2. Load all datasets
// 3. Start the server
TypeOrmDataSource.initialize()
.then(() => DatasetCollection.loadAll()) .then(() => DatasetCollection.loadAll())
.then(() => console.log("All datasets are loaded")) .then(() => console.log("All datasets are loaded"))
.then(() => new Server().start()) .then(() => new Server().start())
+15
View File
@@ -0,0 +1,15 @@
import { NextFunction, Request, Response } from "express";
import { Definitions } from "../services/dmn/interfaces";
import { DMN } from "../services/dmn/DMN";
/**
 * Express middleware that replaces an XML request body with the schema
 * derived from it.
 *
 * When the request's content type is `application/xml`, `req.body` is handed
 * to `DMN.parse` and the result of `DMN.getSchema` is stored back on
 * `req.body`. Requests with any other content type pass through untouched.
 *
 * Parsing failures are forwarded to `next(error)`: Express 4 does not observe
 * the promise returned by an async middleware, so a rejection would otherwise
 * leave the request hanging and surface as an unhandled rejection.
 *
 * @param req - Incoming request; its body is overwritten for XML requests.
 * @param res - Response object (unused here).
 * @param next - Continuation callback; receives the error when parsing fails.
 */
export default async function (
  req: Request,
  res: Response,
  next: NextFunction
) {
  if (!req.is("application/xml")) return next();

  try {
    const dmn: Definitions = await DMN.parse(req.body);
    req.body = DMN.getSchema(dmn);
  } catch (error) {
    return next(error);
  }

  // Called outside the try block so an exception thrown by a downstream
  // handler is not caught here and `next` is never invoked twice.
  next();
}
+1
View File
@@ -1,2 +1,3 @@
export { default as logger } from "./logger"; export { default as logger } from "./logger";
export { default as xmlBodyParser } from "./xmlBodyParser"; export { default as xmlBodyParser } from "./xmlBodyParser";
export { default as bodyToSchema } from "./bodyToSchema";
+5 -2
View File
@@ -1,6 +1,6 @@
import { NextFunction, Request, Response } from "express"; import { NextFunction, Request, Response } from "express";
import { Log } from "../entity/Log"; import { Log } from "../entity/Log";
import { AppDataSource } from "../AppDataSource"; import { TypeOrmDataSource } from "../TypeOrmDataSource";
export default async function logger( export default async function logger(
req: Request, req: Request,
@@ -8,13 +8,16 @@ export default async function logger(
next: NextFunction next: NextFunction
) { ) {
console.info(`[${req.method}] ${req.url}`); console.info(`[${req.method}] ${req.url}`);
if (req.path === "/randomize") {
// Put the log into the database // Put the log into the database
const log: Log = new Log( const log: Log = new Log(
req.url, req.url,
req.method as any, req.method as any,
JSON.stringify(req.body) JSON.stringify(req.body)
); );
await AppDataSource.manager.save(log); await TypeOrmDataSource.manager.save(log);
}
next(); next();
} }
+2 -6
View File
@@ -4,14 +4,10 @@ export default function (req: Request, res: Response, next: NextFunction) {
if (req.is("application/xml")) { if (req.is("application/xml")) {
let data = ""; let data = "";
req.setEncoding("utf8"); req.setEncoding("utf8");
req.on("data", (chunk: any) => { req.on("data", (chunk: any) => (data += chunk));
data += chunk;
});
req.on("end", () => { req.on("end", () => {
req.body = data; req.body = data;
next(); next();
}); });
} else { } else next();
next();
}
} }
+16 -27
View File
@@ -1,31 +1,28 @@
import { Router, Request, Response } from "express"; import { Router, Request, Response } from "express";
import { DatasetCollection } from "../services/dataset"; import { DatasetCollection } from "../services/dataset";
import { DMN } from "../services/dmn/DMN"; import axios from "axios";
import { Definitions } from "../services/dmn/interfaces/";
import { Data } from "../services/data";
const router = Router(); const router = Router();
router.post("/randomize", (req: Request, res: Response) => { router.post("/randomize", (req: Request, res: Response) => {
const size: number = req.query.size const size: number = req.query.size ? parseInt(req.query.size as string) : 10;
? parseInt(req.query.size as string)
: 1000;
const datasetID = DatasetCollection.datasets.map((dataset) => dataset.id); DatasetCollection.getDatasetByMatchingSchema(req.body)
.then((endpoints) => {
// Split evenly the size between the datasets
const sizePerDataset = Math.floor(size / endpoints.length);
Promise.all( return Promise.all(
datasetID.map((id) => { endpoints.map(async (endpoint) => {
const url: URL = new URL(`http://localhost:4321/randomize/${id}`); const params = { size: sizePerDataset };
url.searchParams.append("size", size.toString());
return fetch(url, { return axios
method: "POST", .post(endpoint, req.body, { params })
body: req.body, .then((res) => res.data.data);
headers: { "Content-Type": "application/xml" },
}) })
.then((response) => response.json()) );
.then((json: any) => json.data);
}) })
).then((r) => { .then((r) => {
const data = r const data = r
.flat() .flat()
.sort(() => Math.random() - 0.5) .sort(() => Math.random() - 0.5)
@@ -47,15 +44,7 @@ router.post("/randomize/:id", async (req: Request, res: Response) => {
); );
if (!dataset) return res.status(404).json({ status: "NOT_FOUND" }); if (!dataset) return res.status(404).json({ status: "NOT_FOUND" });
const dmn: Definitions = await DMN.parse(req.body); const data = await dataset.get(size, req.body);
const schema = DMN.getSchema(dmn);
console.log(JSON.stringify(schema, null, 2));
const data: Data[] = await dataset.get(size, schema);
// Randomize the data (temporary)
data.sort(() => Math.random() - 0.5);
return res.status(200).json({ status: "RANDOMIZED", data }); return res.status(200).json({ status: "RANDOMIZED", data });
}); });
+19
View File
@@ -0,0 +1,19 @@
import { MongoClient, Db } from "mongodb";
const uri = "mongodb://localhost:27017";
const dbName = "db";

// Shared connection attempt. The promise itself is cached (not the resolved
// Db) so that callers arriving while the first connect is still pending reuse
// it instead of each creating and connecting their own MongoClient.
let connection: Promise<Db> | undefined;

/**
 * Returns the shared `Db` handle for the `db` database, connecting on first
 * use.
 *
 * Every call made before or after the connection is established receives the
 * same promise. If connecting fails, the cached attempt is discarded so a
 * later call can retry, and the error is propagated to the caller.
 *
 * @returns A promise resolving to the `Db` obtained from the connected client.
 * @throws Whatever `MongoClient.connect` rejects with.
 */
export const getDatabaseConnexion = (): Promise<Db> => {
  if (!connection) {
    const client = new MongoClient(uri, {
      auth: { username: "root", password: "root" },
      connectTimeoutMS: 60000,
      socketTimeoutMS: 60000,
    });

    connection = client
      .connect()
      .then(() => client.db(dbName))
      .catch((error: unknown) => {
        // Forget the failed attempt; otherwise the rejection would be cached
        // and every later call would fail without retrying.
        connection = undefined;
        throw error;
      });
  }
  return connection;
};
-7
View File
@@ -1,7 +0,0 @@
import { Duplex } from "node:stream";
/**
 * Contract shared by the archive handlers: given a source string, produce a
 * duplex stream.
 */
export default interface Archive {
  /**
   * @param source - String identifying what to extract; how it is used is up
   *   to the implementation.
   * @returns A duplex stream.
   */
  extract(source: string): Duplex;
}
-13
View File
@@ -1,13 +0,0 @@
import { Archive, ZipArchive, ArchiveType, GzipArchive, NoneArchive } from "./";
/**
 * Resolves an `ArchiveType` to the shared instance of the matching archive
 * handler.
 */
class ArchiveFactory {
  /**
   * @param archiveType - Archive format to look up.
   * @returns The `instance` of `ZipArchive`, `GzipArchive` or `NoneArchive`.
   * @throws Error("Unsupported archive type") for any other value.
   */
  static getArchive(archiveType: ArchiveType): Archive {
    switch (archiveType) {
      case ArchiveType.ZIP:
        return ZipArchive.instance;
      case ArchiveType.GZIP:
        return GzipArchive.instance;
      case ArchiveType.NONE:
        return NoneArchive.instance;
      default:
        throw new Error("Unsupported archive type");
    }
  }
}

export default ArchiveFactory;
-7
View File
@@ -1,7 +0,0 @@
/**
 * Archive formats known to the application.
 *
 * NOTE(review): the string values look like file-name suffixes — confirm
 * against the callers before relying on that.
 */
enum ArchiveType {
// Zip archive.
ZIP = ".zip",
// Gzip-compressed file.
GZIP = ".gzip",
// Empty value; presumably means the source is not archived — confirm against callers.
NONE = "",
}
export default ArchiveType;
-13
View File
@@ -1,13 +0,0 @@
import { createGunzip } from "node:zlib";
import { Duplex } from "node:stream";
import { Archive } from "./";
/**
 * Archive handler for gzip data: hands out gunzip streams from `node:zlib`.
 */
export default class GzipArchive implements Archive {
  /** Shared instance. */
  public static instance: Archive = new GzipArchive();

  /**
   * @param source - Ignored by this implementation.
   * @returns A new stream created by `createGunzip()` on every call.
   */
  public extract(source: string): Duplex {
    const gunzip: Duplex = createGunzip();
    return gunzip;
  }
}
-16
View File
@@ -1,16 +0,0 @@
import { Archive } from "./";
import { Duplex, Transform } from "node:stream";
/**
 * Archive handler for sources that need no decompression: the returned stream
 * forwards every chunk unchanged.
 */
export default class NoneArchive implements Archive {
  /** Shared instance. */
  public static instance: Archive = new NoneArchive();

  /**
   * @param source - Ignored by this implementation.
   * @returns A new `Transform` whose output equals its input.
   */
  public extract(source: string): Duplex {
    const identity = new Transform({
      transform(data, _encoding, done) {
        done(null, data);
      },
    });
    return identity;
  }
}
-15
View File
@@ -1,15 +0,0 @@
import { Archive } from "./";
import { Duplex } from "node:stream";
import { ParseOne } from "unzipper";
/**
 * Archive handler for zip files, built on unzipper's `ParseOne`.
 */
export default class ZipArchive implements Archive {
  /** Shared instance. */
  public static instance: Archive = new ZipArchive();

  /**
   * @param source - Pattern used to select the entry. It is compiled with
   *   `new RegExp(source)`, so regex metacharacters such as `.` are not
   *   matched literally.
   * @returns The stream returned by `ParseOne` with `forceStream` enabled.
   */
  public extract(source: string): Duplex {
    const entryPattern = new RegExp(source);
    return ParseOne(entryPattern, { forceStream: true });
  }
}
-8
View File
@@ -1,8 +0,0 @@
export { default as ArchiveType } from "./ArchiveType";
export { default as Archive } from "./Archive";
export { default as ArchiveFactory } from "./ArchiveFactory";
export { default as ZipArchive } from "./ZipArchive";
export { default as GzipArchive } from "./GzipArchive";
export { default as NoneArchive } from "./NoneArchive";
@@ -0,0 +1,5 @@
import { Duplex } from "node:stream";
/**
 * Contract shared by the extractors: given extraction options, produce a
 * duplex stream.
 */
interface Extractor {
  /**
   * @param options - Extraction options; `file` is a string whose use is up
   *   to the implementation.
   * @returns A duplex stream.
   */
  extract(options: { file: string }): Duplex;
}

export default Extractor;
@@ -0,0 +1,23 @@
import Extractor from "./Extractor";
import ZipExtractor from "./ZipExtractor";
import GzipExtractor from "./GzipExtractor";
import NoneExtractor from "./NoneExtractor";
/**
 * Kinds of extraction supported by `ExtractorFactory`.
 * Numeric enum; the values are spelled out to make the mapping explicit.
 */
export enum ExtractorType {
  ZIP = 0,
  GZIP = 1,
  NONE = 2,
}
/**
 * Resolves an `ExtractorType` to the shared instance of the matching
 * extractor.
 */
export default class ExtractorFactory {
  /**
   * @param extractorType - Kind of extraction requested.
   * @returns `ZipExtractor.instance` for ZIP, `GzipExtractor.instance` for
   *   GZIP, and `NoneExtractor.instance` for every other value.
   */
  static getExtractor(extractorType: ExtractorType): Extractor {
    if (extractorType === ExtractorType.ZIP) return ZipExtractor.instance;
    if (extractorType === ExtractorType.GZIP) return GzipExtractor.instance;

    // NONE and any unrecognised value fall back to the pass-through extractor.
    return NoneExtractor.instance;
  }
}
@@ -0,0 +1,11 @@
import Extractor from "./Extractor";
import { Duplex } from "node:stream";
import { createGunzip } from "node:zlib";
/**
 * Extractor for gzip data: hands out gunzip streams from `node:zlib`.
 */
class GzipExtractor implements Extractor {
  /** Shared instance. */
  public static instance = new GzipExtractor();

  /**
   * @param _ - Ignored by this implementation.
   * @returns A new stream created by `createGunzip()` on every call.
   */
  extract(_: any): Duplex {
    const gunzip: Duplex = createGunzip();
    return gunzip;
  }
}

export default GzipExtractor;
@@ -0,0 +1,15 @@
import Extractor from "./Extractor";
import { Duplex, Transform } from "node:stream";
import { createGunzip } from "node:zlib";
/**
 * Extractor for sources that need no decompression: the returned stream
 * forwards every chunk unchanged.
 */
class NoneExtractor implements Extractor {
  /** Shared instance. */
  public static instance = new NoneExtractor();

  /**
   * @param _ - Ignored by this implementation.
   * @returns A new `Transform` whose output equals its input.
   */
  extract(_: any): Duplex {
    const identity = new Transform({
      transform(data, _encoding, done) {
        done(null, data);
      },
    });
    return identity;
  }
}

export default NoneExtractor;
@@ -0,0 +1,13 @@
import { Duplex } from "node:stream";
import Extractor from "./Extractor";
import { ParseOne } from "unzipper";
/**
 * Extractor for zip files, built on unzipper's `ParseOne`.
 */
class ZipExtractor implements Extractor {
  /** Shared instance. */
  public static instance: ZipExtractor = new ZipExtractor();

  /**
   * @param options - `file` is the pattern used to select the entry. It is
   *   compiled with `new RegExp(options.file)`, so regex metacharacters such
   *   as `.` are not matched literally.
   * @returns The stream returned by `ParseOne` with `forceStream` enabled.
   */
  public extract(options: { file: string }): Duplex {
    const entryPattern = new RegExp(options.file);
    return ParseOne(entryPattern, { forceStream: true });
  }
}

export default ZipExtractor;
-13
View File
@@ -1,13 +0,0 @@
/**
 * Base shape of a stored record: an optional numeric identifier.
 */
export default interface Data {
  id?: number;
}

/**
 * Error raised for a record that fails validation.
 * Its `name` is the fixed string "Invalid data".
 */
export class InvalidData extends Error {
  /**
   * @param message - Human-readable description of the problem.
   */
  constructor(message: string) {
    super(message);
    this.name = "Invalid data";
  }
}
-53
View File
@@ -1,53 +0,0 @@
import { Data, InvalidData } from "./";
import { Column, Entity, PrimaryGeneratedColumn } from "typeorm";
/**
 * Shape of one raw Nudger record as accepted by `NudgerData.fromRaw`.
 * Every field is a string; the trailing comment on each field shows a sample
 * value. Only `code` and `gs1_country` are read by `fromRaw`.
 */
type RawNudgerData = {
code: string; // "3260014791012",
brand: string; // "ALSATEK",
model: string; // "TL33171",
name: string; // "alsatek lg g3 coque protection aluminium rouge bumper tl33171",
last_updated: string; // "1562430134146",
gs1_country: string; // "FR",
offers_count: string; // "0",
min_price: string; // "",
min_price_compensation: string; // "",
currency: string; // "",
categories: string; // "ACCESSOIRES>COQUE SMARTPHONE",
url: string; // ""
};
/**
 * TypeORM entity holding a barcode and its country, built from Nudger records.
 */
@Entity()
class NudgerData implements Data {
  /** Generated integer primary key. */
  @PrimaryGeneratedColumn({ type: "integer" })
  id?: number;

  @Column()
  barcode_ean_13: string;

  @Column()
  country: string;

  /**
   * @param code - Value stored in `barcode_ean_13`.
   * @param gs1_country - Value stored in `country`.
   */
  constructor(code: string, gs1_country: string) {
    this.barcode_ean_13 = code;
    this.country = gs1_country;
  }

  /**
   * Builds an entity from a raw record.
   *
   * @param raw - Raw record; only `code` and `gs1_country` are used.
   * @returns A new `NudgerData`.
   * @throws InvalidData when either field is empty or `code` is not exactly
   *   13 characters long.
   */
  fromRaw(raw: RawNudgerData): NudgerData {
    const { code, gs1_country } = raw;
    const isUsable =
      Boolean(code) && Boolean(gs1_country) && code.length === 13;
    if (!isUsable) {
      throw new InvalidData("Invalid data");
    }
    return new NudgerData(code, gs1_country);
  }

  /**
   * Converts an entity to its outward-facing representation.
   *
   * @param nudgerData - Entity to convert.
   * @returns An object with the keys "Barcode (EAN 13)" and "Country".
   */
  asData(nudgerData: NudgerData): any {
    const { barcode_ean_13, country } = nudgerData;
    return {
      "Barcode (EAN 13)": barcode_ean_13,
      Country: country,
    };
  }
}

export default NudgerData;
-46
View File
@@ -1,46 +0,0 @@
import { Data, InvalidData } from "./";
import { Column, Entity, PrimaryColumn, PrimaryGeneratedColumn } from "typeorm";
/** Fields read from one raw Open Food Facts row. */
type RawOpenFoodFactsData = {
  code: string;
  countries_en: string;
};

/**
 * Persisted Open Food Facts record: a 13-character barcode and its country.
 */
@Entity()
class OpenFoodFactsData implements Data {
  @PrimaryGeneratedColumn({ type: "integer" })
  id?: number;

  @Column()
  barcode_ean_13: string;

  @Column()
  country: string;

  /**
   * @param code - Barcode stored in `barcode_ean_13`.
   * @param gs1_country - Country stored in `country`.
   */
  constructor(code: string, gs1_country: string) {
    this.barcode_ean_13 = code;
    this.country = gs1_country;
  }

  /**
   * Build a record from a raw row.
   * @returns A new `OpenFoodFactsData` holding the row's code and countries.
   * @throws {InvalidData} When the code or countries value is empty, or the
   * code is not exactly 13 characters long.
   */
  fromRaw(raw: RawOpenFoodFactsData): OpenFoodFactsData {
    const { code: barcode, countries_en: origin } = raw;
    const usable =
      Boolean(barcode) && Boolean(origin) && barcode.length === 13;
    if (usable) {
      return new OpenFoodFactsData(barcode, origin);
    }
    throw new InvalidData("Invalid data");
  }

  /**
   * Convert a record into the labelled object exposed to consumers.
   * @param openData - The record to convert.
   */
  asData(openData: OpenFoodFactsData): any {
    const { barcode_ean_13: barcode, country } = openData;
    return {
      "Barcode (EAN 13)": barcode,
      Country: country,
    };
  }
}
export default OpenFoodFactsData;
-45
View File
@@ -1,45 +0,0 @@
import { Data, InvalidData } from "./";
import { Column, Entity, PrimaryGeneratedColumn } from "typeorm";
/** Fields of one raw world-cities row. */
type RawWorldCitiesData = {
  name: string;
  country: string;
  subcountry: string;
  geonameid: string;
};

/**
 * Persisted world-cities record: a 6-character geoname id and its country.
 */
@Entity()
class WorldCitiesData implements Data {
  @PrimaryGeneratedColumn({ type: "integer" })
  id?: number;

  @Column()
  geoname_id: string;

  @Column()
  country: string;

  /**
   * @param geonameId - Identifier stored in `geoname_id`.
   * @param country - Country stored in `country`.
   */
  constructor(geonameId: string, country: string) {
    this.geoname_id = geonameId;
    this.country = country;
  }

  /**
   * Build a record from a raw row.
   * @returns A new `WorldCitiesData` holding the row's geoname id and country.
   * @throws {InvalidData} When the id or country is empty, or the id is not
   * exactly 6 characters long.
   */
  fromRaw(raw: RawWorldCitiesData): WorldCitiesData {
    const { geonameid: identifier, country: origin } = raw;
    const usable =
      Boolean(identifier) && Boolean(origin) && identifier.length === 6;
    if (usable) {
      return new WorldCitiesData(identifier, origin);
    }
    throw new InvalidData("Invalid data");
  }

  /**
   * Convert a record into the labelled object exposed to consumers.
   * @param worldCitiesData - The record to convert.
   */
  asData(worldCitiesData: WorldCitiesData): any {
    const { geoname_id: identifier, country } = worldCitiesData;
    return {
      "Geoname ID": identifier,
      Country: country,
    };
  }
}
export default WorldCitiesData;
-5
View File
@@ -1,5 +0,0 @@
export { default as Data, InvalidData } from "./Data";
export { default as NudgerData } from "./NudgerData";
export { default as OpenFoodFactsData } from "./OpenFoodFactsData";
export { default as WorldCitiesData } from "./WorldCitiesData";
@@ -0,0 +1,35 @@
import { Transform } from "node:stream";
/** One parsed row of the Nudger dataset; each field shows a sample value. */
type NudgerData = {
  /** e.g. "3260014791012" */
  code: string;
  /** e.g. "ALSATEK" */
  brand: string;
  /** e.g. "TL33171" */
  model: string;
  /** e.g. "alsatek lg g3 coque protection aluminium rouge bumper tl33171" */
  name: string;
  /** e.g. "1562430134146" */
  last_updated: string;
  /** e.g. "FR" */
  gs1_country: string;
  /** e.g. "0" */
  offers_count: string;
  /** e.g. "" */
  min_price: string;
  /** e.g. "" */
  min_price_compensation: string;
  /** e.g. "" */
  currency: string;
  /** e.g. "ACCESSOIRES>COQUE SMARTPHONE" */
  categories: string;
  /** e.g. "" */
  url: string;
};

/**
 * Object-mode stream that reduces Nudger rows to a barcode/country pair.
 * Rows with an empty code or country are dropped.
 */
export default class NudgerDataTransformer extends Transform {
  constructor() {
    super({ objectMode: true });
  }

  /**
   * Emit the labelled record for one row, or nothing when the row is unusable.
   * @param chunk - The parsed row.
   * @param encoding - Unused.
   * @param callback - Called once the row has been handled.
   */
  _transform(chunk: NudgerData, encoding: string, callback: () => void) {
    const barcode = chunk.code;
    const origin = chunk.gs1_country;

    if (!barcode || !origin) {
      callback();
      return;
    }

    this.push({ "Barcode (EAN 13)": barcode, Country: origin });
    callback();
  }
}
@@ -0,0 +1,25 @@
import { Transform } from "node:stream";
/** Fields read from one parsed Open Food Facts row. */
type OpenfoodfactsData = {
  code: string;
  countries_en: string;
};

/**
 * Object-mode stream that reduces Open Food Facts rows to a barcode/country
 * pair. Rows with an empty code or countries value are dropped.
 */
export default class OpenfoodfactsDataTransformer extends Transform {
  constructor() {
    super({ objectMode: true });
  }

  /**
   * Emit the labelled record for one row, or nothing when the row is unusable.
   * @param chunk - The parsed row.
   * @param encoding - Unused.
   * @param callback - Called once the row has been handled.
   */
  _transform(chunk: OpenfoodfactsData, encoding: string, callback: () => void) {
    const barcode = chunk.code;
    const origin = chunk.countries_en;

    if (!barcode || !origin) {
      callback();
      return;
    }

    this.push({ "Barcode (EAN 13)": barcode, Country: origin });
    callback();
  }
}
@@ -0,0 +1,27 @@
import { Transform } from "node:stream";
/** Fields of one parsed world-cities row. */
type WorldCitiesData = {
  name: string;
  country: string;
  subcountry: string;
  geonameid: string;
};

/**
 * Object-mode stream that reduces world-cities rows to a geoname-id/country
 * pair. Rows with an empty id or country are dropped.
 */
export default class WorldCitiesDataTransformer extends Transform {
  constructor() {
    super({ objectMode: true });
  }

  /**
   * Emit the labelled record for one row, or nothing when the row is unusable.
   * @param chunk - The parsed row.
   * @param encoding - Unused.
   * @param callback - Called once the row has been handled.
   */
  _transform(chunk: WorldCitiesData, encoding: string, callback: () => void) {
    const identifier = chunk.geonameid;
    const origin = chunk.country;

    if (!identifier || !origin) {
      callback();
      return;
    }

    this.push({ "Geoname ID": identifier, Country: origin });
    callback();
  }
}
+105 -168
View File
@@ -1,72 +1,67 @@
import { pipeline, Transform, Writable } from "node:stream"; import { pipeline, Transform, Writable } from "node:stream";
import { promisify } from "node:util"; import { promisify } from "node:util";
import { Validator } from "jsonschema";
import FileService from "../FileService"; import FileService from "../FileService";
import { ArchiveFactory, ArchiveType } from "../archive";
import { ParserFactory, ParserType } from "../parser"; import { ParserFactory, ParserType } from "../parser";
import { Data, InvalidData} from "../data"; import ExtractorFactory, {
import { AppDataSource } from "../../AppDataSource"; ExtractorType,
import { EntityManager, EntityTarget, Repository } from "typeorm"; } from "../archive_extractor/ExtractorFactory";
import Extractor from "../archive_extractor/Extractor";
type DatasetParams = { import Parser from "../parser/Parser";
id: string; import { getDatabaseConnexion } from "../DataLake";
dataConstructor: (params: any) => Data; import { validate } from "jsonschema";
dataType: Data;
source: string;
file: string;
archiveType: ArchiveType;
parserType: ParserType;
options?: DatasetOptions;
};
type DatasetOptions = {
parser?: any;
};
/** /**
* Represents a dataset that can be loaded and queried * Represents a dataset that can be loaded and queried
*/ */
export default class Dataset {
  readonly id: string;
  readonly uri: string;
  readonly endpoint: string;

  // Defaults: no archive extraction and CSV parsing, until overridden by the setters.
  private extractor: Extractor = ExtractorFactory.getExtractor(
    ExtractorType.NONE
  );
  private parser: Parser = ParserFactory.getParser(ParserType.CSV);
  private extractorOptions: any;
  private parserOptions: any;
  private dataTransformer?: Transform;

  /**
   * Create a new dataset instance
   * @param id - The unique identifier of the dataset; also used as the name
   * of the database collection the records are stored in
   * @param uri - The location the dataset is downloaded from
   * @param endpoint - The endpoint associated with this dataset
   */
  constructor({
    id,
    uri,
    endpoint,
  }: {
    id: string;
    uri: string;
    endpoint: string;
  }) {
    this.id = id;
    this.uri = uri;
    this.endpoint = endpoint;
  }

  /**
   * Select the extractor applied to the downloaded stream.
   * @param type - The extractor type to look up in the factory
   * @param options - Passed as-is to the extractor's `extract` call
   * @returns This dataset, to allow chaining
   */
  setExtractor(type: ExtractorType, options: any): this {
    this.extractor = ExtractorFactory.getExtractor(type);
    this.extractorOptions = options;
    return this;
  }

  /**
   * Select the parser applied to the extracted stream.
   * @param type - The parser type to look up in the factory
   * @param options - Passed as-is to the parser's `parse` call
   * @returns This dataset, to allow chaining
   */
  setParser(type: ParserType, options: any): this {
    this.parser = ParserFactory.getParser(type);
    this.parserOptions = options;
    return this;
  }

  /**
   * Set the stream that turns parsed rows into the records to store.
   * @param dataTransformer - The transform stream placed after the parser
   * @returns This dataset, to allow chaining
   */
  setDataTransformer(dataTransformer: Transform): this {
    this.dataTransformer = dataTransformer;
    return this;
  }

  /**
   * Download the dataset and store its records in the collection named after
   * `id`. Nothing is downloaded when that collection already has documents.
   *
   * A failure while streaming or inserting is logged, not rethrown. In that
   * case the collection is emptied again so that a later call does not
   * mistake a partial import for a completed one.
   *
   * @throws {Error} - If no data transformer was set. Errors from obtaining
   * the database connection or counting documents also propagate.
   */
  public async load(): Promise<void> {
    if (!this.dataTransformer) {
      throw new Error("Data transformer is not set");
    }

    const db = await getDatabaseConnexion();
    const collection = db.collection(this.id);
    const count = await collection.countDocuments();
    if (count > 0) {
      console.log(`Dataset ${this.id} already loaded`);
      return;
    }

    const pipelineAsync = promisify(pipeline);
    console.log(`Download: ${this.uri}`);

    const batchSize = 1000;
    const pending: any[] = [];

    // Accumulates records and writes them in batches. Every outcome is
    // reported through `callback` so a database error fails the pipeline
    // instead of leaving it waiting.
    const sink = new Writable({
      objectMode: true,
      write(chunk, _encoding, callback) {
        pending.push(chunk);
        if (pending.length < batchSize) {
          callback();
          return;
        }
        collection.insertMany(pending.splice(0, batchSize)).then(
          () => callback(),
          (error: Error) => callback(error)
        );
      },
      final(callback) {
        // Flush whatever is left once the upstream has ended.
        if (pending.length === 0) {
          callback();
          return;
        }
        collection.insertMany(pending.splice(0)).then(
          () => callback(),
          (error: Error) => callback(error)
        );
      },
    });

    try {
      await pipelineAsync(
        await FileService.getFileStream(this.uri),
        this.extractor.extract(this.extractorOptions),
        this.parser.parse(this.parserOptions),
        this.dataTransformer,
        sink
      );
      console.log(`Dataset ${this.id} loaded`);
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(`Error loading dataset ${this.id}: ${message}`);

      // The collection was empty when this call started, so anything in it
      // now is a partial import from this run.
      try {
        await collection.deleteMany({});
      } catch (cleanupError: unknown) {
        const reason =
          cleanupError instanceof Error
            ? cleanupError.message
            : String(cleanupError);
        console.error(
          `Could not clear partially loaded dataset ${this.id}: ${reason}`
        );
      }
    }
  }

  /**
   * Get a random sample of records from the dataset.
   * @param length - The number of records to sample (default: 10)
   * @param schema - JSON schema of the expected records; its `properties`
   * keys select the returned fields, and records that do not validate
   * against it are left out
   * @returns The sampled records that validate against `schema`; this can be
   * fewer than `length`
   */
  async get(length: number = 10, schema: any) {
    const db = await getDatabaseConnexion();

    // Convert JSON schema to projection
    const projection: Record<string, 0 | 1> = { _id: 0 };
    if (schema?.properties) {
      for (const field of Object.keys(schema.properties)) {
        projection[field] = 1;
      }
    }

    // Sample first so only the sampled documents are projected.
    const datas = await db
      .collection(this.id)
      .aggregate([{ $sample: { size: length } }, { $project: projection }])
      .toArray();

    return datas.filter((data) => validate(data, schema).valid);
  }
}
+47 -43
View File
@@ -1,56 +1,60 @@
import { Data, NudgerData, OpenFoodFactsData, WorldCitiesData } from "../data";
import { ArchiveType } from "../archive";
import { Dataset } from "./"; import { Dataset } from "./";
import { ParserType } from "../parser"; import { ParserType } from "../parser";
import WorldCitiesDataTransformer from "../data_transformer/WorldCitiesDataTransformer";
import { ExtractorType } from "../archive_extractor/ExtractorFactory";
import NudgerDataTransformer from "../data_transformer/NudgerDataTransformer";
import OpenfoodfactsDataTransformer from "../data_transformer/OpenfoodfactsDataTransformer";
import axios from "axios";
/**
 * Registry of the datasets known to the application.
 */
class DatasetCollection {
  public static datasets: Dataset[] = [
    new Dataset({
      id: "nudger",
      uri: "https://files.opendatarchives.fr/data.cquest.org/open4goods/gtin-open-data.zip",
      endpoint: "http://localhost:4321/randomize/nudger",
    })
      .setExtractor(ExtractorType.ZIP, {
        file: "open4goods-full-gtin-dataset.csv",
      })
      .setParser(ParserType.CSV, {})
      .setDataTransformer(new NudgerDataTransformer()),
    new Dataset({
      id: "openfoodfacts",
      uri: "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz",
      endpoint: "http://localhost:4321/randomize/openfoodfacts",
    })
      .setExtractor(ExtractorType.GZIP, {
        file: "en.openfoodfacts.org.products.csv",
      })
      .setParser(ParserType.CSV, { delimiter: "\t", quote: null })
      .setDataTransformer(new OpenfoodfactsDataTransformer()),
    new Dataset({
      id: "world-cities",
      uri: "https://raw.githubusercontent.com/datasets/world-cities/refs/heads/main/data/world-cities.csv",
      endpoint: "http://localhost:4321/randomize/world-cities",
    })
      .setExtractor(ExtractorType.NONE, {})
      .setParser(ParserType.CSV, {})
      .setDataTransformer(new WorldCitiesDataTransformer()),
  ];

  /**
   * Start loading every registered dataset.
   * @returns A promise that settles once every `load()` call has resolved,
   * or rejects as soon as one of them rejects
   */
  public static loadAll(): Promise<void[]> {
    return Promise.all(this.datasets.map((dataset) => dataset.load()));
  }

  /**
   * Find the datasets able to produce records matching a schema.
   *
   * Each dataset endpoint is asked for one record (`size: 1`) with the schema
   * as request body. A dataset matches when the response carries a non-empty
   * `data` array. An endpoint that fails or answers with an unexpected shape
   * is logged and treated as not matching, so one unavailable dataset does
   * not hide the others.
   *
   * @param schema - JSON schema posted to every dataset endpoint
   * @returns The endpoints of the matching datasets, in registration order
   */
  public static async getDatasetByMatchingSchema(
    schema: any
  ): Promise<string[]> {
    const probes = this.datasets.map(
      async (dataset): Promise<string | null> => {
        try {
          const res = await axios.post(dataset.endpoint, schema, {
            params: { size: 1 },
          });
          const rows: unknown = res.data?.data;
          return Array.isArray(rows) && rows.length > 0
            ? dataset.endpoint
            : null;
        } catch (error: unknown) {
          const reason = error instanceof Error ? error.message : String(error);
          console.error(
            `Schema probe failed for dataset ${dataset.id}: ${reason}`
          );
          return null;
        }
      }
    );

    const endpoints = await Promise.all(probes);
    return endpoints.filter(
      (endpoint): endpoint is string => endpoint !== null
    );
  }
}
export default DatasetCollection; export default DatasetCollection;