diff --git a/.env-example b/.env-example
index 4529794..968ba9d 100644
--- a/.env-example
+++ b/.env-example
@@ -11,6 +11,8 @@
 FORCE_REDOWNLOAD_AFTER_HOURS=24
 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
+
 # Sample local setup - values read by docker compose (for simple Postgres DB
 # creation), and used by the app
 DB_NAME=bulk_data_service_db
diff --git a/README.md b/README.md
index 3a294d9..35d365a 100644
--- a/README.md
+++ b/README.md
@@ -59,12 +59,21 @@
 The example `.env` file (`.env-example`) is configured to use the above docker compose setup.
 If you don't use the docker compose setup, then you will need to change the values in the `.env` file accordingly.

-Once the docker compose setup is running, start the bulk download app with:
+Once the docker compose setup is running, you can run the dataset updater part of the app with (this will download the datasets and upload them to Azurite):

 ```
 dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50
 ```

+You can run the zipper operation with:
+
+```
+dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run
+```
+
+It will store the ZIP files in the directory defined in the `ZIP_WORKING_DIR` environment variable.
+
+
 *Note:* not all versions of `dotenv` require a `run` subcommand.

 ## Development on the app
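For reference, the zipper's output location is controlled by the new `ZIP_WORKING_DIR` setting added above. A minimal, illustrative way to see what a `--single-run` of the zipper produced, using only the Python standard library (the default path is the one from `.env-example`; this snippet is not part of the app):

```
# Illustrative only: list the ZIP files the zipper left in ZIP_WORKING_DIR.
import os
from pathlib import Path

zip_dir = Path(os.environ.get("ZIP_WORKING_DIR", "/tmp/bulk-data-service-zip"))
zip_files = sorted(zip_dir.glob("**/*.zip"))

print(f"{len(zip_files)} ZIP file(s) under {zip_dir}")
for zip_file in zip_files:
    print(" -", zip_file.relative_to(zip_dir))
```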
{} " "HEAD request exception details: {}".format(download_within_hours, extra_err_message, e) ) - } | e.args[0] + } + | e.args[0] ) context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"])) diff --git a/src/config/config.py b/src/config/config.py index 28908e4..f96a15e 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -20,7 +20,7 @@ "AZURE_STORAGE_CONNECTION_STRING", "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML", "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP", - "CHECKER_LOOP_WAIT_MINS" + "CHECKER_LOOP_WAIT_MINS", ] diff --git a/tests/integration/test_dataset_indexing.py b/tests/integration/test_dataset_indexing.py index 25a5dcf..ebe14af 100644 --- a/tests/integration/test_dataset_indexing.py +++ b/tests/integration/test_dataset_indexing.py @@ -4,12 +4,12 @@ from azure.storage.blob import BlobServiceClient from bulk_data_service.checker import checker_run -from bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields +from bulk_data_service.dataset_indexing import get_full_index_source_fields, get_index_name, get_minimal_index_fields from helpers.helpers import get_and_clear_up_context # noqa: F401 from utilities.azure import get_azure_container_name -def test_index_creation(get_and_clear_up_context): # noqa: F811 +def test_index_uploaded_to_blob_storage(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context @@ -32,7 +32,7 @@ def test_index_creation(get_and_clear_up_context): # noqa: F811 blob_service_client.close() -def test_minimal_index_creation(get_and_clear_up_context): # noqa: F811 +def test_minimal_index_creation_for_download_success(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01" @@ -64,7 +64,45 @@ def test_minimal_index_creation(get_and_clear_up_context): # noqa: F811 blob_service_client.close() -def test_full_index_creation(get_and_clear_up_context): # noqa: F811 +def test_minimal_index_creation_for_download_failure(get_and_clear_up_context): # noqa: F811 + context = get_and_clear_up_context + + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + datasets_in_bds = {} + checker_run(context, datasets_in_bds) + + blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"]) + + zip_container_name = get_azure_container_name(context, "zip") + + minimal_index_name = get_index_name(context, "minimal") + + minimal_index_blob = blob_service_client.get_blob_client(zip_container_name, minimal_index_name) + + minimal_index = json.loads(minimal_index_blob.download_blob().readall()) + + dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")] + + assert dataset["name"] in minimal_index["datasets"] + + index_item = minimal_index["datasets"][dataset["name"]] + + for field in get_minimal_index_fields(context): + if isinstance(dataset[field], uuid.UUID): + assert uuid.UUID(index_item[field]) == dataset[field] + elif dataset[field] is None or isinstance(dataset[field], str): + assert index_item[field] == dataset[field] + + assert index_item["url_xml"] is None + assert index_item["url_zip"] is None + + # -2 because of the two autogenerated fields + assert len(index_item.keys()) - 2 == len(get_minimal_index_fields(context)) + + blob_service_client.close() + + +def test_full_index_creation_for_download_success(get_and_clear_up_context): # noqa: F811 context = 
diff --git a/tests/integration/test_dataset_indexing.py b/tests/integration/test_dataset_indexing.py
index 25a5dcf..ebe14af 100644
--- a/tests/integration/test_dataset_indexing.py
+++ b/tests/integration/test_dataset_indexing.py
@@ -4,12 +4,12 @@
 from azure.storage.blob import BlobServiceClient

 from bulk_data_service.checker import checker_run
-from bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields
+from bulk_data_service.dataset_indexing import get_full_index_source_fields, get_index_name, get_minimal_index_fields
 from helpers.helpers import get_and_clear_up_context  # noqa: F401
 from utilities.azure import get_azure_container_name


-def test_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_index_uploaded_to_blob_storage(get_and_clear_up_context):  # noqa: F811

     context = get_and_clear_up_context

@@ -32,7 +32,7 @@ def test_index_creation(get_and_clear_up_context):  # noqa: F811
     blob_service_client.close()


-def test_minimal_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
     context = get_and_clear_up_context

     context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -64,7 +64,45 @@ def test_minimal_index_creation(get_and_clear_up_context):  # noqa: F811
     blob_service_client.close()


-def test_full_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    minimal_index_name = get_index_name(context, "minimal")
+
+    minimal_index_blob = blob_service_client.get_blob_client(zip_container_name, minimal_index_name)
+
+    minimal_index = json.loads(minimal_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in minimal_index["datasets"]
+
+    index_item = minimal_index["datasets"][dataset["name"]]
+
+    for field in get_minimal_index_fields(context):
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert index_item[field] == dataset[field]
+
+    assert index_item["url_xml"] is None
+    assert index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(index_item.keys()) - 2 == len(get_minimal_index_fields(context))
+
+    blob_service_client.close()
+
+
+def test_full_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
     context = get_and_clear_up_context

     context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -87,13 +125,55 @@ def test_full_index_creation(get_and_clear_up_context):  # noqa: F811
     full_index_item = full_index["datasets"][dataset["name"]]

-    for field in get_full_index_fields(context):
+    for field in get_full_index_source_fields(context):
+        if field == "download_error_message" or field == "head_error_message":
+            continue
         if isinstance(dataset[field], uuid.UUID):
             assert uuid.UUID(full_index_item[field]) == dataset[field]
         elif isinstance(dataset[field], str):
             assert full_index_item[field] == dataset[field]

     # -2 because of the two autogenerated fields
-    assert len(full_index_item.keys()) - 2 == len(get_full_index_fields(context))
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))
+
+
+def test_full_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    full_index_name = get_index_name(context, "full")
+
+    full_index_blob = blob_service_client.get_blob_client(zip_container_name, full_index_name)
+
+    full_index = json.loads(full_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in full_index["datasets"]
+
+    full_index_item = full_index["datasets"][dataset["name"]]
+
+    for field in get_full_index_source_fields(context):
+        if field == "download_error_message" or field == "head_error_message":
+            continue
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(full_index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert full_index_item[field] == dataset[field]
+
+    assert full_index_item["url_xml"] is None
+    assert full_index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))

     blob_service_client.close()
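The two new failure-path tests repeat the same field-by-field comparison as the success-path ones. If that pattern keeps growing, it could be pulled into a shared helper along these lines (a sketch only; the helper name is hypothetical, and the `-2` offset mirrors the tests' accounting for the autogenerated `url_xml`/`url_zip` fields):

```
# Hypothetical helper mirroring the assertion pattern used in the tests above.
import uuid


def assert_index_item_matches(index_item: dict, dataset: dict, fields: list[str]) -> None:
    for field in fields:
        if field in ("download_error_message", "head_error_message"):
            continue
        if isinstance(dataset[field], uuid.UUID):
            assert uuid.UUID(index_item[field]) == dataset[field]
        elif dataset[field] is None or isinstance(dataset[field], str):
            assert index_item[field] == dataset[field]
    # -2 because of the two autogenerated fields (url_xml, url_zip)
    assert len(index_item.keys()) - 2 == len(fields)
```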