fix: unify handling of empty values in index #25

Merged: 5 commits, Oct 29, 2024
.env-example (2 additions, 0 deletions)
@@ -11,6 +11,8 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24

REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

ZIP_WORKING_DIR=/tmp/bulk-data-service-zip

# Sample local setup - values read by docker compose (for simple Postgres DB
# creation), and used by the app
DB_NAME=bulk_data_service_db
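The new `ZIP_WORKING_DIR` value above is read from the environment like the other settings in `.env-example`. As a rough sketch of how such a setting is consumed (the app's real configuration loading lives in `src/config/config.py` and is not shown in full in this diff):

```
import os

# Sketch only: read ZIP_WORKING_DIR from the environment, falling back to the
# path suggested in .env-example. The real app may treat a missing value
# differently.
zip_working_dir = os.environ.get("ZIP_WORKING_DIR", "/tmp/bulk-data-service-zip")
print(f"ZIP files are written under: {zip_working_dir}")
```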
README.md (10 additions, 1 deletion)
@@ -59,12 +59,21 @@ docker compose up

The example `.env` file (`.env-example`) is configured to use the above docker compose setup. If you don't use the docker compose setup, then you will need to change the values in the `.env` file accordingly.

-Once the docker compose setup is running, start the bulk download app with:
+Once the docker compose setup is running, you can run the dataset updater part of the app (which downloads the datasets and uploads them to Azurite) with:

```
dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50
```

You can run the zipper operation with:

```
dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run
```

It will store the ZIP files in the directory defined in the `ZIP_WORKING_DIR` environment variable.


*Note:* not all versions of `dotenv` require a `run` subcommand.

## Development on the app
pyproject.toml (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
[project]
name = "bulk-data-service"
version = "0.0.1"
version = "0.1.6"
requires-python = ">= 3.12"
readme = "README.md"
dependencies = [
src/bulk_data_service/dataset_indexing.py (4 additions, 4 deletions)
@@ -58,8 +58,8 @@ def get_index_entry(context: dict, dataset: dict, index_type: str) -> dict[str,
else:
dataset_index_entry = get_full_index_entry_from_dataset(context, dataset)

dataset_index_entry["url_xml"] = ""
dataset_index_entry["url_zip"] = ""
dataset_index_entry["url_xml"] = None
dataset_index_entry["url_zip"] = None

if dataset_index_entry["last_successful_download"] is not None:
dataset_index_entry["url_xml"] = get_azure_blob_public_url(context, dataset, "xml")
@@ -80,7 +80,7 @@ def get_minimal_index_entry_from_dataset(context: dict, dataset: dict) -> dict:


def get_full_index_entry_from_dataset(context: dict, dataset: dict) -> dict:
-full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_fields(context)}
+full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_source_fields(context)}

field_from_json_str_to_object(full_index_entry, "download_error_message", "download_error_details")

Expand All @@ -96,7 +96,7 @@ def field_from_json_str_to_object(entry: dict, source_field: str, dest_field: st
del entry[source_field]


-def get_full_index_fields(context: dict) -> list[str]:
+def get_full_index_source_fields(context: dict) -> list[str]:
return [
"id",
"name",
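The switch from `""` to `None` above is the heart of this PR: `None` serialises to JSON `null`, so a dataset that has never been successfully downloaded carries the same explicit "no value" marker for `url_xml` and `url_zip` in both the minimal and the full index. A minimal sketch of the effect (standard library only, not the service's own code):

```
import json

# None becomes null in the JSON index, instead of the previous empty string "".
entry = {"name": "example-dataset", "url_xml": None, "url_zip": None}
print(json.dumps(entry))
# -> {"name": "example-dataset", "url_xml": null, "url_zip": null}
```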
src/bulk_data_service/dataset_updater.py (2 additions, 1 deletion)
@@ -214,7 +214,8 @@ def check_dataset_etag_last_mod_header(
"return non-200 status. {} "
"HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
)
-} | e.args[0]
+}
+| e.args[0]
)

context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"]))
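The change above only reflows the dict-union expression for the formatter: `{...} | e.args[0]` merges the dictionary carried in the exception's `args[0]` over the base error entry (the `|` operator for dicts, Python 3.9+). A small sketch of the pattern with illustrative values:

```
# Keys from the right-hand dict are added to (and override) the left-hand dict.
base_entry = {"head_error_message": "HEAD request returned non-200 status"}
exception_details = {"status_code": 503, "reason": "Service Unavailable"}
merged = base_entry | exception_details
print(merged)
# {'head_error_message': 'HEAD request returned non-200 status',
#  'status_code': 503, 'reason': 'Service Unavailable'}
```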
src/config/config.py (1 addition, 1 deletion)
@@ -20,7 +20,7 @@
"AZURE_STORAGE_CONNECTION_STRING",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP",
"CHECKER_LOOP_WAIT_MINS"
"CHECKER_LOOP_WAIT_MINS",
]


tests/integration/test_dataset_indexing.py (86 additions, 6 deletions)
@@ -4,12 +4,12 @@
from azure.storage.blob import BlobServiceClient

from bulk_data_service.checker import checker_run
-from bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields
+from bulk_data_service.dataset_indexing import get_full_index_source_fields, get_index_name, get_minimal_index_fields
from helpers.helpers import get_and_clear_up_context # noqa: F401
from utilities.azure import get_azure_container_name


-def test_index_creation(get_and_clear_up_context): # noqa: F811
+def test_index_uploaded_to_blob_storage(get_and_clear_up_context): # noqa: F811

context = get_and_clear_up_context

@@ -32,7 +32,7 @@ def test_index_creation(get_and_clear_up_context): # noqa: F811
blob_service_client.close()


-def test_minimal_index_creation(get_and_clear_up_context): # noqa: F811
+def test_minimal_index_creation_for_download_success(get_and_clear_up_context): # noqa: F811
context = get_and_clear_up_context

context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -64,7 +64,45 @@ def test_minimal_index_creation(get_and_clear_up_context): # noqa: F811
blob_service_client.close()


-def test_full_index_creation(get_and_clear_up_context): # noqa: F811
+def test_minimal_index_creation_for_download_failure(get_and_clear_up_context): # noqa: F811
context = get_and_clear_up_context

context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
datasets_in_bds = {}
checker_run(context, datasets_in_bds)

blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])

zip_container_name = get_azure_container_name(context, "zip")

minimal_index_name = get_index_name(context, "minimal")

minimal_index_blob = blob_service_client.get_blob_client(zip_container_name, minimal_index_name)

minimal_index = json.loads(minimal_index_blob.download_blob().readall())

dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]

assert dataset["name"] in minimal_index["datasets"]

index_item = minimal_index["datasets"][dataset["name"]]

for field in get_minimal_index_fields(context):
if isinstance(dataset[field], uuid.UUID):
assert uuid.UUID(index_item[field]) == dataset[field]
elif dataset[field] is None or isinstance(dataset[field], str):
assert index_item[field] == dataset[field]

assert index_item["url_xml"] is None
assert index_item["url_zip"] is None

# -2 because of the two autogenerated fields
assert len(index_item.keys()) - 2 == len(get_minimal_index_fields(context))

blob_service_client.close()


def test_full_index_creation_for_download_success(get_and_clear_up_context): # noqa: F811
context = get_and_clear_up_context

context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -87,13 +125,55 @@ def test_full_index_creation(get_and_clear_up_context): # noqa: F811

full_index_item = full_index["datasets"][dataset["name"]]

-for field in get_full_index_fields(context):
+for field in get_full_index_source_fields(context):
if field == "download_error_message" or field == "head_error_message":
continue
if isinstance(dataset[field], uuid.UUID):
assert uuid.UUID(full_index_item[field]) == dataset[field]
elif isinstance(dataset[field], str):
assert full_index_item[field] == dataset[field]

# -2 because of the two autogenerated fields
-assert len(full_index_item.keys()) - 2 == len(get_full_index_fields(context))
+assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))

blob_service_client.close()


def test_full_index_creation_for_download_failure(get_and_clear_up_context): # noqa: F811
context = get_and_clear_up_context

context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
datasets_in_bds = {}
checker_run(context, datasets_in_bds)

blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])

zip_container_name = get_azure_container_name(context, "zip")

full_index_name = get_index_name(context, "full")

full_index_blob = blob_service_client.get_blob_client(zip_container_name, full_index_name)

full_index = json.loads(full_index_blob.download_blob().readall())

dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]

assert dataset["name"] in full_index["datasets"]

full_index_item = full_index["datasets"][dataset["name"]]

for field in get_full_index_source_fields(context):
if field == "download_error_message" or field == "head_error_message":
continue
if isinstance(dataset[field], uuid.UUID):
assert uuid.UUID(full_index_item[field]) == dataset[field]
elif dataset[field] is None or isinstance(dataset[field], str):
assert full_index_item[field] == dataset[field]

assert full_index_item["url_xml"] is None
assert full_index_item["url_zip"] is None

# -2 because of the two autogenerated fields
assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))

blob_service_client.close()