From 4f9b4f9a1760f5ff85e2d9740099d337ac090020 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Fri, 4 Oct 2024 01:26:34 +0200 Subject: [PATCH] CTK: Improve transformations for MongoDB Table Loader --- application/cratedb-toolkit/requirements.txt | 2 +- application/cratedb-toolkit/test_io.py | 6 ++ .../zyp-mongodb-json-files.yaml | 82 +++++++++++++++++-- 3 files changed, 82 insertions(+), 8 deletions(-) diff --git a/application/cratedb-toolkit/requirements.txt b/application/cratedb-toolkit/requirements.txt index 723596be..2501fcb2 100644 --- a/application/cratedb-toolkit/requirements.txt +++ b/application/cratedb-toolkit/requirements.txt @@ -1 +1 @@ -cratedb-toolkit[influxdb,mongodb]==0.0.27 +cratedb-toolkit[influxdb,mongodb]==0.0.29 diff --git a/application/cratedb-toolkit/test_io.py b/application/cratedb-toolkit/test_io.py index efb727dc..dbe57303 100644 --- a/application/cratedb-toolkit/test_io.py +++ b/application/cratedb-toolkit/test_io.py @@ -134,6 +134,12 @@ def test_ctk_load_table_mongodb_json(drop_testing_tables): progress=GitProgressPrinter(), ) + # The `countries-big.json` file contains bogus characters. + countries_big_path = datasets_path / "countries-big.json" + payload = countries_big_path.read_text() + payload = payload.replace("\ufeff", "") + countries_big_path.write_text(payload) + # Invoke data transfer. command = f""" ctk load table \ diff --git a/application/cratedb-toolkit/zyp-mongodb-json-files.yaml b/application/cratedb-toolkit/zyp-mongodb-json-files.yaml index 31605164..e1d28c1b 100644 --- a/application/cratedb-toolkit/zyp-mongodb-json-files.yaml +++ b/application/cratedb-toolkit/zyp-mongodb-json-files.yaml @@ -38,11 +38,79 @@ meta: type: zyp-project version: 1 + collections: -- address: - container: datasets - name: companies - pre: - rules: - - expression: .[] |= del(.image.available_sizes, .screenshots[].available_sizes) - type: jq + + - address: + container: datasets + name: books + pre: + rules: + - expression: .[] |= (._id |= tostring) + type: jq + + - address: + container: datasets + name: city_inspections + pre: + rules: + - expression: | + .[] |= ( + select(true) + | .address.number |= numbers + | .address.zip |= numbers + | .certificate_number |= tostring + ) + type: jq + + - address: + container: datasets + name: companies + pre: + rules: + - expression: | + .[] |= + del( + .image.available_sizes, + .screenshots[].available_sizes, + .created_at + ) + type: jq + + - address: + container: datasets + name: countries-big + pre: + rules: + - expression: .[] |= (.ISO |= tostring) + type: jq + + - address: + container: datasets + name: products + pre: + rules: + - expression: | + .[] |= ( + select(true) + | if (.for) then .for |= to_array end + | if (.type) then .type |= to_array end + | if (.limits.data.n) then .limits.data.n |= tostring end + | if (.limits.sms.n) then .limits.sms.n |= tostring end + | if (.limits.voice.n) then .limits.voice.n |= tostring end + | del(.additional_tarriffs) + ) + type: jq + + - address: + container: datasets + name: restaurant + pre: + rules: + - expression: | + .[] |= ( + select(true) + | .rating |= tostring + | .type |= to_array + ) + type: jq