From 453a1d9d44e76c3e641e7f9ccfdb1a34cacc0c17 Mon Sep 17 00:00:00 2001
From: Simon K <6615834+simon-20@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:01:33 +0100
Subject: [PATCH 1/5] docs: instructions for running zipper part of app

---
 .env-example |  2 ++
 README.md    | 11 ++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/.env-example b/.env-example
index 4529794..968ba9d 100644
--- a/.env-example
+++ b/.env-example
@@ -11,6 +11,8 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24

 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
+
 # Sample local setup - values read by docker compose (for simple Postgres DB
 # creation), and used by the app
 DB_NAME=bulk_data_service_db
diff --git a/README.md b/README.md
index 3a294d9..35d365a 100644
--- a/README.md
+++ b/README.md
@@ -59,12 +59,21 @@ docker compose up
 The example `.env` file (`.env-example`) is configured to use the above docker compose setup.
 If you don't use the docker compose setup, then you will need to change the values in the `.env` file accordingly.

-Once the docker compose setup is running, start the bulk download app with:
+Once the docker compose setup is running, you can run the dataset updater part of the app (this downloads the datasets and uploads them to Azurite) with:

 ```
 dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50
 ```

+You can run the zipper operation with:
+
+```
+dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run
+```
+
+The zipper stores the ZIP files in the directory defined by the `ZIP_WORKING_DIR` environment variable.
+
+
 *Note:* not all versions of `dotenv` require a `run` subcommand.

 ## Development on the app

From 52cc2b55c89e18a0f0bd5de0e8a9fdccdfb72b67 Mon Sep 17 00:00:00 2001
From: Simon K <6615834+simon-20@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:08:30 +0100
Subject: [PATCH 2/5] style: linting

* Linting that was previously missed
---
 src/bulk_data_service/dataset_updater.py | 3 ++-
 src/config/config.py                     | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/bulk_data_service/dataset_updater.py b/src/bulk_data_service/dataset_updater.py
index ce76cbd..e6be614 100644
--- a/src/bulk_data_service/dataset_updater.py
+++ b/src/bulk_data_service/dataset_updater.py
@@ -214,7 +214,8 @@ def check_dataset_etag_last_mod_header(
                 "return non-200 status. {} "
                 "HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
             )
-        } | e.args[0]
+        }
+        | e.args[0]
     )

     context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"]))
diff --git a/src/config/config.py b/src/config/config.py
index 28908e4..f96a15e 100644
--- a/src/config/config.py
+++ b/src/config/config.py
@@ -20,7 +20,7 @@
     "AZURE_STORAGE_CONNECTION_STRING",
     "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML",
     "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP",
-    "CHECKER_LOOP_WAIT_MINS"
+    "CHECKER_LOOP_WAIT_MINS",
 ]



From 6225e6164130d115a10266eccc2cc911a9847d51 Mon Sep 17 00:00:00 2001
From: Simon K <6615834+simon-20@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:11:35 +0100
Subject: [PATCH 3/5] test: tests for index creation on failed downloads

* Two new tests to check the content of the index entries for datasets
  which fail to download.
---
 tests/integration/test_dataset_indexing.py | 92 ++++++++++++++++++++--
 1 file changed, 86 insertions(+), 6 deletions(-)

diff --git a/tests/integration/test_dataset_indexing.py b/tests/integration/test_dataset_indexing.py
index 25a5dcf..ebe14af 100644
--- a/tests/integration/test_dataset_indexing.py
+++ b/tests/integration/test_dataset_indexing.py
@@ -4,12 +4,12 @@
 from azure.storage.blob import BlobServiceClient

 from bulk_data_service.checker import checker_run
-from bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields
+from bulk_data_service.dataset_indexing import get_full_index_source_fields, get_index_name, get_minimal_index_fields
 from helpers.helpers import get_and_clear_up_context  # noqa: F401
 from utilities.azure import get_azure_container_name


-def test_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_index_uploaded_to_blob_storage(get_and_clear_up_context):  # noqa: F811

     context = get_and_clear_up_context

@@ -32,7 +32,7 @@
     blob_service_client.close()


-def test_minimal_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
     context = get_and_clear_up_context

     context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -64,7 +64,45 @@
     blob_service_client.close()


-def test_full_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    minimal_index_name = get_index_name(context, "minimal")
+
+    minimal_index_blob = blob_service_client.get_blob_client(zip_container_name, minimal_index_name)
+
+    minimal_index = json.loads(minimal_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in minimal_index["datasets"]
+
+    index_item = minimal_index["datasets"][dataset["name"]]
+
+    for field in get_minimal_index_fields(context):
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert index_item[field] == dataset[field]
+
+    assert index_item["url_xml"] is None
+    assert index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(index_item.keys()) - 2 == len(get_minimal_index_fields(context))
+
+    blob_service_client.close()
+
+
+def test_full_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
     context = get_and_clear_up_context

     context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -87,13 +125,55 @@

     full_index_item = full_index["datasets"][dataset["name"]]

-    for field in get_full_index_fields(context):
+    for field in get_full_index_source_fields(context):
+        if field == "download_error_message" or field == "head_error_message":
+            continue
         if isinstance(dataset[field], uuid.UUID):
             assert uuid.UUID(full_index_item[field]) == dataset[field]
         elif isinstance(dataset[field], str):
             assert full_index_item[field] == dataset[field]

     # -2 because of the two autogenerated fields
-    assert len(full_index_item.keys()) - 2 == len(get_full_index_fields(context))
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))
+
+    blob_service_client.close()
+
+
+def test_full_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    full_index_name = get_index_name(context, "full")
+
+    full_index_blob = blob_service_client.get_blob_client(zip_container_name, full_index_name)
+
+    full_index = json.loads(full_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in full_index["datasets"]
+
+    full_index_item = full_index["datasets"][dataset["name"]]
+
+    for field in get_full_index_source_fields(context):
+        if field == "download_error_message" or field == "head_error_message":
+            continue
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(full_index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert full_index_item[field] == dataset[field]
+
+    assert full_index_item["url_xml"] is None
+    assert full_index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))

     blob_service_client.close()

From bec41bf75c867600953a7fb54e1d044bdaad620c Mon Sep 17 00:00:00 2001
From: Simon K <6615834+simon-20@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:25:37 +0100
Subject: [PATCH 4/5] fix: output null in index when no dataset download

* output null instead of empty string when there is no cached download
  available for the dataset
* rename function that provides list of fields to better describe its
  purpose
---
 src/bulk_data_service/dataset_indexing.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/bulk_data_service/dataset_indexing.py b/src/bulk_data_service/dataset_indexing.py
index 1a549cf..a8a9f5f 100644
--- a/src/bulk_data_service/dataset_indexing.py
+++ b/src/bulk_data_service/dataset_indexing.py
@@ -58,8 +58,8 @@ def get_index_entry(context: dict, dataset: dict, index_type: str) -> dict[str,
     else:
         dataset_index_entry = get_full_index_entry_from_dataset(context, dataset)

-    dataset_index_entry["url_xml"] = ""
-    dataset_index_entry["url_zip"] = ""
+    dataset_index_entry["url_xml"] = None
+    dataset_index_entry["url_zip"] = None

     if dataset_index_entry["last_successful_download"] is not None:
         dataset_index_entry["url_xml"] = get_azure_blob_public_url(context, dataset, "xml")
@@ -80,7 +80,7 @@ def get_minimal_index_entry_from_dataset(context: dict, dataset: dict) -> dict:

 def get_full_index_entry_from_dataset(context: dict, dataset: dict) -> dict:

-    full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_fields(context)}
+    full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_source_fields(context)}

     field_from_json_str_to_object(full_index_entry, "download_error_message", "download_error_details")

@@ -96,7 +96,7 @@ def field_from_json_str_to_object(entry: dict, source_field: str, dest_field: st
     del entry[source_field]


-def get_full_index_fields(context: dict) -> list[str]:
+def get_full_index_source_fields(context: dict) -> list[str]:
     return [
         "id",
         "name",

From 02d649ca82fe34b54a23c3e06a904701745299ad Mon Sep 17 00:00:00 2001
From: Simon K <6615834+simon-20@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:25:51 +0100
Subject: [PATCH 5/5] build: bump version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f982c49..1971064 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "bulk-data-service"
-version = "0.0.1"
+version = "0.1.6"
 requires-python = ">= 3.12"
 readme = "README.md"
 dependencies = [