diff --git a/.env-example b/.env-example
new file mode 100644
index 0000000..8b8bfb5
--- /dev/null
+++ b/.env-example
@@ -0,0 +1,29 @@
+DATA_REGISTRATION=ckan-registry
+DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
+
+BLOB_STORAGE_BASE_PUBLIC_URL=http://127.0.0.1:10000/devstoreaccount1
+
+NUMBER_DOWNLOADER_THREADS=1 # makes for easier testing locally
+
+FORCE_REDOWNLOAD_AFTER_HOURS=24
+
+REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72
+
+# Log file
+LOGFILE=
+
+# Sample local setup - values read by docker compose (for simple Postgres DB
+# creation), and used by the app
+DB_NAME=bulk_data_service_db
+DB_USER=bds
+DB_PASS=pass
+DB_HOST=localhost
+DB_PORT=5255
+DB_SSL_MODE=disable
+DB_CONNECTION_TIMEOUT=30
+
+# Local Azurite Emulator
+AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;
+
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip
diff --git a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml
new file mode 100644
index 0000000..bda631a
--- /dev/null
+++ b/.github/workflows/build-and-deploy-job.yml
@@ -0,0 +1,123 @@
+name: Generic build and deploy (called by other workflows)
+
+on:
+ workflow_call:
+ inputs:
+ APP_NAME:
+ required: true
+ type: string
+ TARGET_ENVIRONMENT:
+ required: true
+ type: string
+
+
+jobs:
+ build-and-deploy:
+ runs-on: ubuntu-latest
+ env:
+ APP_NAME: ${{ inputs.APP_NAME }}
+ TARGET_ENVIRONMENT: ${{ inputs.TARGET_ENVIRONMENT }}
+
+ DOCKER_IMAGE_TAG: ${{ github.sha }}
+
+ # Needed as an environment variable for use of 'az' cmd in inline shell script
+ ACR_LOGIN_SERVER: ${{ secrets.ACR_LOGIN_SERVER }}
+ ACR_USERNAME: ${{ secrets.ACR_USERNAME }}
+ ACR_PASSWORD: ${{ secrets.ACR_PASSWORD }}
+
+ steps:
+ - name: 'Generate/build derived environment variables'
+ run: |
+ echo "TARGET_ENVIRONMENT_UPPER=${TARGET_ENVIRONMENT^^}" >> ${GITHUB_ENV}
+ echo "CONTAINER_INSTANCE_BASE_NAME=aci-${APP_NAME}" >> ${GITHUB_ENV}
+ echo "RESOURCE_GROUP_BASE_NAME=rg-${APP_NAME}" >> ${GITHUB_ENV}
+
+ - name: 'Print calculated environment variables'
+ run: |
+ echo $TARGET_ENVIRONMENT_UPPER
+ echo $CONTAINER_INSTANCE_BASE_NAME
+ echo $RESOURCE_GROUP_BASE_NAME
+
+ - name: 'Checkout GitHub Action'
+ uses: actions/checkout@v4
+
+ - name: 'Login via Azure CLI'
+ uses: azure/login@v2
+ with:
+ creds: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_CREDENTIALS')] }}
+
+ - name: 'Login to Docker Hub'
+ uses: docker/login-action@v3.2.0
+ with:
+ username: ${{ secrets.DOCKER_HUB_USERNAME }}
+ password: ${{ secrets.DOCKER_HUB_TOKEN }}
+
+ - name: 'Login to Azure Container Registry'
+ uses: azure/docker-login@v2
+ with:
+ login-server: ${{ env.ACR_LOGIN_SERVER }}
+ username: ${{ env.ACR_USERNAME }}
+ password: ${{ env.ACR_PASSWORD }}
+
+ - name: 'Build and push image'
+ run: |
+ IMAGE_NAME=$ACR_LOGIN_SERVER/$APP_NAME-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG
+ echo "IMAGE_NAME=$IMAGE_NAME" >> $GITHUB_ENV
+ docker build . -f Dockerfile -t $IMAGE_NAME
+ docker push $IMAGE_NAME
+
+ - name: 'Print IMAGE_NAME for Bulk Data Service image'
+ run: echo $IMAGE_NAME
+
+      - name: 'Create htpasswd file and build/push nginx reverse proxy image'
+ run: |
+ htpasswd -c -b ./azure-deployment/nginx-reverse-proxy/htpasswd prom "${{ secrets.PROM_NGINX_REVERSE_PROXY_PASSWORD }}"
+ docker build ./azure-deployment/nginx-reverse-proxy -t criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG
+ docker push criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG
+
+ - name: 'Delete existing container group'
+ uses: 'azure/CLI@v2'
+ with:
+ inlineScript: |
+ az -v
+ az container delete -y \
+ --name "${{ env.CONTAINER_INSTANCE_BASE_NAME }}-${{ env.TARGET_ENVIRONMENT }}" \
+ --resource-group "${{ env.RESOURCE_GROUP_BASE_NAME }}-${{ env.TARGET_ENVIRONMENT }}"
+
+ - name: 'Replace Env Vars and Secrets in ARM Yaml template'
+ env:
+ # Credentials for the app's resources
+ AZURE_STORAGE_CONNECTION_STRING: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_CONNECTION_STRING')] }}
+
+ DB_HOST: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_HOST')] }}
+ DB_USER: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_USER')] }}
+ DB_PASS: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_PASS')] }}
+ DB_NAME: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_NAME')] }}
+ DB_PORT: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_PORT')] }}
+ DB_SSL_MODE: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_SSL_MODE')] }}
+ DB_CONNECTION_TIMEOUT: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_CONNECTION_TIMEOUT')] }}
+
+ LOG_WORKSPACE_ID: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'LOG_WORKSPACE_ID')] }}
+ LOG_WORKSPACE_KEY: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'LOG_WORKSPACE_KEY')] }}
+
+ # Variables which configure the app
+ DATA_REGISTRATION: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRATION')] }}
+ DATA_REGISTRY_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_BASE_URL')] }}
+ NUMBER_DOWNLOADER_THREADS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'NUMBER_DOWNLOADER_THREADS')] }}
+ FORCE_REDOWNLOAD_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'FORCE_REDOWNLOAD_AFTER_HOURS')] }}
+ REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS')] }}
+ ZIP_WORKING_DIR: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'ZIP_WORKING_DIR')] }}
+ AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML')] }}
+ AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP')] }}
+
+ run: |
+ ./azure-deployment/generate-manifest-from-template.sh
+
+ - name: 'Deploy group to Azure Container Instances'
+ uses: 'azure/CLI@v2'
+ with:
+ inlineScript: |
+ az -v
+ az container create --debug \
+ --resource-group "${{ env.RESOURCE_GROUP_BASE_NAME }}-${{ env.TARGET_ENVIRONMENT }}" \
+ --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml
diff --git a/.github/workflows/deploy-to-dev.yml b/.github/workflows/deploy-to-dev.yml
new file mode 100644
index 0000000..9e1d962
--- /dev/null
+++ b/.github/workflows/deploy-to-dev.yml
@@ -0,0 +1,22 @@
+name: Deploy Bulk Data Service to dev
+
+
+on:
+ workflow_dispatch:
+ push:
+ paths-ignore:
+ - '.github/workflows/deploy-to-prod.yml'
+ branches:
+ - develop
+
+
+jobs:
+ run-tests:
+ uses: ./.github/workflows/test.yml
+ call-build-and-deploy:
+ needs: run-tests
+ uses: ./.github/workflows/build-and-deploy-job.yml
+ secrets: inherit
+ with:
+ APP_NAME: "bulk-data-service"
+      TARGET_ENVIRONMENT: "dev"
diff --git a/.github/workflows/deploy-to-prod.yml b/.github/workflows/deploy-to-prod.yml
new file mode 100644
index 0000000..aee507a
--- /dev/null
+++ b/.github/workflows/deploy-to-prod.yml
@@ -0,0 +1,16 @@
+name: Deploy Bulk Data Service to production
+
+
+on:
+ workflow_dispatch:
+ release:
+ types: [published]
+
+
+jobs:
+ call-build-and-deploy:
+ uses: ./.github/workflows/build-and-deploy-job.yml
+ secrets: inherit
+ with:
+ APP_NAME: "bulk-data-service"
+      TARGET_ENVIRONMENT: "prod"
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..97113bf
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,31 @@
+name: Run Automated Tests
+
+on:
+ workflow_call:
+ workflow_dispatch:
+ push:
+ branches:
+ - '**'
+ - '!develop'
+ - '!main'
+
+jobs:
+ run-tests:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Setup python
+ uses: actions/setup-python@v5
+ with:
+ python-version: 3.12
+ architecture: x64
+
+ - name: Install requirements-dev.txt
+ run: pip install -r requirements-dev.txt
+
+ - name: Run docker-compose
+ run: cd ./tests-local-environment; docker compose up -d
+
+ - name: Run automated tests
+ run: pytest
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a8aa899
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,17 @@
+__pycache__
+*.py[cod]
+
+/.actrc
+
+.mypy_cache
+.pytest_cache
+.ve
+
+/.env
+
+/azure-deployment/azure-resource-manager-deployment-manifest.yml
+/azure-deployment/manual-azure-deploy-secrets.env
+/azure-deployment/manual-azure-deploy-variables.env
+/azure-deployment/nginx-reverse-proxy/htpasswd
+
+/web/index.html
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..4a6ec4f
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,21 @@
+{
+ // Use IntelliSense to learn about possible attributes.
+ // Hover to view descriptions of existing attributes.
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Python Debugger: Bulk Data Service - Checker - Single Run",
+ "type": "debugpy",
+ "request": "launch",
+ "program": "src/iati_bulk_data_service.py",
+ "args": [
+ "--operation",
+ "checker",
+ "--single-run"
+ ],
+ "console": "integratedTerminal",
+ "envFile": "${workspaceFolder}/.env"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..bad5e3b
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,31 @@
+{
+ "python.testing.pytestArgs": [
+ "tests"
+ ],
+ "python.testing.unittestEnabled": false,
+ "python.testing.pytestEnabled": true,
+ "files.trimTrailingWhitespace": true,
+ "python.analysis.typeCheckingMode": "basic",
+ "editor.formatOnSave": true,
+ "[python]": {
+ "editor.formatOnSave": true
+ },
+ "editor.codeActionsOnSave": {
+ "source.organizeImports": "always"
+ },
+ "isort.args": [
+ "--profile",
+ "black",
+ "--py",
+ "312"
+ ],
+ "isort.path": [
+ "isort"
+ ],
+ "isort.interpreter": [
+ "python"
+ ],
+ "mypy.targets": [
+ "src/"
+ ],
+}
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3b702c2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.12-slim-bookworm
+
+RUN apt-get update -y
+
+WORKDIR /bulk-data-service
+
+COPY requirements.txt .
+
+RUN pip install -r requirements.txt
+
+COPY src/ src
+COPY db-migrations/ db-migrations
+
+ENTRYPOINT ["/usr/local/bin/python", "src/iati_bulk_data_service.py"]
diff --git a/README.md b/README.md
index 8b13789..c2da6c5 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,232 @@
+# IATI Bulk Data Service Tool
+
+## Summary
+
+ Product | IATI Bulk Data Service
+--- | ---
+Description | A Python application which fetches the list of registered IATI datasets and periodically downloads them, making each available individually as an XML file and ZIP file, and also providing a ZIP file containing all the datasets.
+Website | None
+Related |
+Documentation | Rest of README.md
+Technical Issues | See https://github.com/IATI/bulk-data-service/issues
+Support | https://iatistandard.org/en/guidance/get-support/
+
+## High-level requirements
+
+* Python 3.12
+* Postgres DB
+* Azure storage account with blob storage enabled
+
+## Running the app locally
+
+### First-time setup
+
+#### 1. Set up and activate a Python virtual environment
+
+```
+python3.12 -m venv .ve
+source .ve/bin/activate
+```
+
+#### 2. Install the dependencies
+
+```
+pip install -r requirements.txt
+```
+
+#### 3. Set up a `.env` file
+
+The IATI Bulk Data Service app, the docker compose setup for local development (Azurite, Postgres), and the yoyo database migrations tool (which the Bulk Data Service app runs, but which it is sometimes useful to run from the command line during development) are all configured via environment variables. When running locally, these are set via a `.env` file. To create one, copy the example file and edit as needed:
+
+```
+cp .env-example .env
+```
+
+The example file is preconfigured to work with the local docker compose setup.
+
+#### 4. Install some version of `dotenv` (optional)
+
+The `.env` file is used when running things locally to store the environment variables that configure the apps mentioned above. Docker Compose reads it automatically, but when running the Bulk Data Service app or `yoyo` directly you need to get these variables into the shell environment: you can either source the file in your current terminal, or use one of the various `dotenv` command line tools to load it on each run (using `dotenv` lets you quickly switch different `.env` files in and out, which can be useful for testing, debugging, etc.).
+
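+For example, one way to load the variables into your current shell is to source the file. This is a minimal sketch, assuming a bash-like shell, that you are in the repository root, and that any values containing shell-special characters (such as the `;`s in the Azurite connection string) are quoted in your `.env`:
+
+```bash
+# export every variable defined in .env into the current shell session
+set -a
+source .env
+set +a
+```
+
+With a `dotenv`-style tool you instead prefix each command, as in the `dotenv run ...` examples later in this README.
+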
+### Running after first-time setup
+
+Running the app successfully requires a Postgres database and a connection to an Azure blob storage account. There is a docker compose setup which starts an instance of each service locally; it can be run with:
+
+```
+docker compose up
+```
+
+The example `.env` file (`.env-example`) is configured to use the above docker compose setup. If you don't use the docker compose setup, then you will need to change the values in the `.env` file accordingly.
+
+Once the docker compose setup is running, start the bulk download app with:
+
+```
+dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50
+```
+
+*Note:* not all versions of `dotenv` require a `run` subcommand.
+
+## Development on the app
+
+### Code checking and formatting
+
+The project is set up with various code linters and formatters. You can set up your IDE to run them automatically on file save, or you can run them manually. (Configuration files for VS Code are included.)
+
+To run these you need to install the extra development dependencies into the Python virtual environment using the following:
+
+```
+pip install -r requirements-dev.txt
+```
+
+#### isort
+
+Import sorter `isort` is configured via `pyproject.toml` and can be run with:
+
+```
+isort .
+```
+
+#### mypy
+
+Type checker `mypy` is configured via `pyproject.toml`. It can be run with:
+
+```
+mypy
+```
+
+#### flake8
+
+Flake8 is configured via `pyproject.toml`, and can be run with:
+
+```
+flake8
+```
+
+#### black
+
+Code formatter `black` is configured via `pyproject.toml` and can be run with:
+
+```
+black .
+```
+
+
+### Adding new dependencies to main project
+
+New dependencies need to be added to `pyproject.toml`.
+
+After new dependencies have been added, `requirements.txt` should be regenerated using:
+
+```
+pip-compile --upgrade -o requirements.txt pyproject.toml
+```
+
+### Adding new dependencies to the development environment
+
+New development dependencies need to be added to `pyproject.toml` in the `dev` value of the `[project.optional-dependencies]` section.
+
+After new dev dependencies have been added, `requirements-dev.txt` should be regenerated using:
+
+```
+pip-compile --upgrade --extra dev -o requirements-dev.txt pyproject.toml
+```
+
+### Database migrations
+
+The Bulk Data Service's database schema management is handled by [yoyo](https://ollycope.com/software/yoyo/latest/). The database is created and migrated (if needed) whenever the app is run, so during development, it is always safe to drop the database if you want to start over.
+
+`yoyo` has a command line tool which can be used to do this, and which can also be used to roll the database schema back to any particular revision, if that is useful during development.
+
+`yoyo` is configured via `yoyo.ini`, which draws values from environment variables, so it is best run using `dotenv`, which will configure it for whatever local setup you are using.
+
+The following commands may be useful:
+
+```
+dotenv run yoyo -- list # list available migrations
+dotenv run yoyo -- rollback # rollback, interactively
+dotenv run yoyo -- new # create file for a new migration
+```
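+
+If you want to apply any outstanding migrations by hand, rather than letting the app do it on startup, the following should work (a sketch, assuming your `.env` points at the local docker compose database):
+
+```bash
+dotenv run yoyo -- apply    # apply outstanding migrations, interactively
+```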
+
+
+### Automated tests
+
+Requirements: docker compose
+
+There are some unit and integration tests written in `pytest`. The integration tests work by running various parts of the code against running servers, and there is a docker compose setup which launches Azurite, Postgres, and a Mockoon server. The Azurite and Postgres services are ephemeral, and don't persist any data to disk. The Mockoon server serves some of the artifacts in `tests/artifacts` over HTTP, and has some routes configured to return error codes so these can be tested.
+
+To run the tests, you must first start this docker compose setup with:
+
+```
+cd tests-local-environment
+docker compose up --remove-orphans
+```
+
+Note: the `--remove-orphans` just helps keep things clean as you develop and alter the setup.
+
+Once this is running, run the tests with:
+
+```
+pytest
+```
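+
+The standard `pytest` selection options also work if you only want to run a subset of the tests; for example (the keyword below is purely illustrative):
+
+```bash
+pytest tests/unit       # run only the unit tests
+pytest -k "indices"     # run only tests whose names match a keyword
+```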
+
+This automated test environment is configured via the following files:
+
+* `tests-local-environment/.env`
+* `tests-local-environment/docker-compose.yml`
+* `tests-local-environment/mockoon-registration-and-data-server-config.json`
+
+You can use the Mockoon GUI application to edit the mockoon server configuration file (`mockoon-registration-and-data-server-config.json`).
+
+## Provisioning and Deployment
+
+### Initial Provisioning
+
+You can create an Azure-based instance of the Bulk Data Service using the `azure-create-resources.sh` script. It must be run from the root of the repository, and it requires (i) the environment variable `BDS_DB_ADMIN_PASSWORD` to be set to the password for the database, and (ii) a single parameter which is the name of the environment/instance. For example, the following command will create a dev instance:
+
+```bash
+BDS_DB_ADMIN_PASSWORD=passwordHere ./azure-provision/azure-create-resources.sh dev
+```
+
+This will create a resource group on Azure called `rg-bulk-data-service-dev`, and then create and configure all the Azure resources needed for the Bulk Data Service within that resource group (except for the Container Instance, which is created/updated as part of the deploy stage).
+
+At the end of its run, the `azure-create-resources.sh` script will print out the various secrets which need to be added to GitHub Actions.
+
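+If you use the GitHub CLI, one way to add them is sketched below (secret/variable names are shown for a `dev` instance; `azure-credentials.json` is just a stand-in for wherever you saved the JSON credentials printed by the script):
+
+```bash
+gh secret set DEV_AZURE_CREDENTIALS < azure-credentials.json
+gh secret set DEV_AZURE_STORAGE_CONNECTION_STRING --body "<connection string from the script output>"
+gh variable set DEV_DATA_REGISTRATION --body "ckan-registry"
+```
+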
+### Deployment - CI/CD
+
+The application is set up to deploy to the dev instance when a PR is merged to
+`develop`, and to production when a release is published on the `main` branch.
+
+Sometimes, when altering the CI/CD setup or otherwise debugging, it can be
+useful to do things manually. The Bulk Data Service can be released to an Azure instance (e.g., a test instance) using the following command:
+
+```bash
+./azure-deployment/manual-azure-deploy-from-local.sh test
+```
+
+For this to work, you need to put the secrets you want to use in `azure-deployment/manual-azure-deploy-secrets.env` and the variables you want to use in `azure-deployment/manual-azure-deploy-variables.env`. There is an example of each of these files that can be used as a starting point.
+
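+One way to start is to copy the checked-in example files and then fill in the values:
+
+```bash
+cp azure-deployment/manual-azure-deploy-secrets-example.env azure-deployment/manual-azure-deploy-secrets.env
+cp azure-deployment/manual-azure-deploy-variables-example.env azure-deployment/manual-azure-deploy-variables.env
+```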
+
+### Manually building the docker image (to test/develop the deployment setup)
+
+You can build the docker image using the following command, replacing `INSTANCE_NAME` with the relevant instance:
+
+```bash
+docker build . -t criati.azurecr.io/bulk-data-service-INSTANCE_NAME
+```
+
+To run it locally:
+
+```bash
+docker container run --env-file=.env-docker "criati.azurecr.io/bulk-data-service-dev" --operation checker --single-run --run-for-n-datasets 20
+```
+
+
+## Resources
+
+[Reference docs for the Azure deployment YAML file](https://learn.microsoft.com/en-us/azure/container-instances/container-instances-reference-yaml#schema) (`azure-deployment/azure-resource-manager-deployment-template.yml`).
+
+
diff --git a/azure-deployment/azure-resource-manager-deployment-template.yml b/azure-deployment/azure-resource-manager-deployment-template.yml
new file mode 100644
index 0000000..119970f
--- /dev/null
+++ b/azure-deployment/azure-resource-manager-deployment-template.yml
@@ -0,0 +1,82 @@
+name: "aci-#APP_NAME#-#TARGET_ENVIRONMENT#"
+apiVersion: "2021-10-01"
+location: "uksouth"
+properties: # Properties of container group
+ imageRegistryCredentials: # Credentials to pull a private image
+ - server: "#ACR_LOGIN_SERVER#"
+ username: "#ACR_USERNAME#"
+ password: "#ACR_PASSWORD#"
+ restartPolicy: "Never"
+ osType: "Linux"
+ diagnostics:
+ logAnalytics:
+ workspaceId: "#LOG_WORKSPACE_ID#"
+ workspaceKey: "#LOG_WORKSPACE_KEY#"
+ containers:
+ - name: "#APP_NAME#-#TARGET_ENVIRONMENT#"
+ properties: # Properties of an instance
+ resources: # Resource requirements of the instance
+ requests:
+ memoryInGB: 4
+ cpu: 1
+ image: "#ACR_LOGIN_SERVER#/#APP_NAME#-#TARGET_ENVIRONMENT#:#DOCKER_IMAGE_TAG#"
+ ports:
+ - port: 9090
+ command:
+ - "/usr/local/bin/python"
+ - "src/iati_bulk_data_service.py"
+ - "--operation"
+ - "checker"
+ environmentVariables:
+ - name: DATA_REGISTRATION
+ value: "#DATA_REGISTRATION#"
+ - name: DATA_REGISTRY_BASE_URL
+ value: "#DATA_REGISTRY_BASE_URL#"
+ - name: BLOB_STORAGE_BASE_PUBLIC_URL
+ value: "https://sabulkdataservice#TARGET_ENVIRONMENT#.blob.core.windows.net"
+ - name: NUMBER_DOWNLOADER_THREADS
+ value: "#NUMBER_DOWNLOADER_THREADS#"
+ - name: FORCE_REDOWNLOAD_AFTER_HOURS
+ value: "#FORCE_REDOWNLOAD_AFTER_HOURS#"
+ - name: REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS
+ value: "#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#"
+ - name: LOGFILE
+ value: ""
+ - name: ZIP_WORKING_DIR
+ value: "#ZIP_WORKING_DIR#"
+ - name: AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML
+ value: "#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML#"
+ - name: AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP
+ value: "#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP#"
+
+ - name: AZURE_STORAGE_CONNECTION_STRING
+ secureValue: "#AZURE_STORAGE_CONNECTION_STRING#"
+ - name: DB_HOST
+ secureValue: "#APP_NAME#-db-#TARGET_ENVIRONMENT#.postgres.database.azure.com"
+ - name: DB_PORT
+ secureValue: "#DB_PORT#"
+ - name: DB_USER
+ secureValue: "#DB_USER#"
+ - name: DB_PASS
+ secureValue: "#DB_PASS#"
+ - name: DB_NAME
+ secureValue: "#DB_NAME#"
+ - name: DB_SSL_MODE
+ secureValue: "#DB_SSL_MODE#"
+ - name: DB_CONNECTION_TIMEOUT
+ secureValue: "#DB_CONNECTION_TIMEOUT#"
+ - name: nginx-proxy-for-prometheus
+ properties:
+ image: "#ACR_LOGIN_SERVER#/bds-prom-nginx-reverse-proxy-#TARGET_ENVIRONMENT#:#DOCKER_IMAGE_TAG#"
+ ports:
+ - port: 9158
+ protocol: TCP
+ resources:
+ requests:
+ cpu: 1.0
+ memoryInGB: 0.5
+ ipAddress:
+ type: "public"
+ dnsNameLabel: "#APP_NAME#-#TARGET_ENVIRONMENT#"
+ ports:
+ - port: 9158
diff --git a/azure-deployment/generate-manifest-from-template.sh b/azure-deployment/generate-manifest-from-template.sh
new file mode 100755
index 0000000..505e945
--- /dev/null
+++ b/azure-deployment/generate-manifest-from-template.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+# This script is not intended to be run directly.
+# When doing a manual Azure deploy from a local machine (for testing, debugging, etc)
+# this script will be run by 'manual-azure-deploy-from-local.sh'; it is also run
+# by the generic 'build-and-deploy' Github action
+
+if [ "$LOCAL_DEPLOY" == "true" ]; then
+ echo "Deploying from local environment..."
+ source ./azure-deployment/manual-azure-deploy-secrets.env
+ source ./azure-deployment/manual-azure-deploy-variables.env
+fi
+
+# Copy the template to the manifest
+
+cp -f ./azure-deployment/azure-resource-manager-deployment-template.yml ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+
+# Variables which configure dependent services
+
+sed -i "s^#APP_NAME#^$APP_NAME^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#TARGET_ENVIRONMENT#^$TARGET_ENVIRONMENT^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DOCKER_IMAGE_TAG#^$DOCKER_IMAGE_TAG^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+
+sed -i "s^#ACR_LOGIN_SERVER#^$ACR_LOGIN_SERVER^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#ACR_USERNAME#^$ACR_USERNAME^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#ACR_PASSWORD#^$ACR_PASSWORD^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+
+sed -i "s^#LOG_WORKSPACE_ID#^$LOG_WORKSPACE_ID^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#LOG_WORKSPACE_KEY#^$LOG_WORKSPACE_KEY^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+
+sed -i "s^#AZURE_STORAGE_CONNECTION_STRING#^$AZURE_STORAGE_CONNECTION_STRING^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+
+sed -i "s^#DB_HOST#^$DB_HOST^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DB_PORT#^$DB_PORT^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DB_USER#^$DB_USER^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DB_PASS#^$DB_PASS^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DB_NAME#^$DB_NAME^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DB_SSL_MODE#^$DB_SSL_MODE^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DB_CONNECTION_TIMEOUT#^$DB_CONNECTION_TIMEOUT^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+
+
+# Variables which configure the behaviour of the Bulk Data Service
+
+sed -i "s^#DATA_REGISTRATION#^$DATA_REGISTRATION^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#DATA_REGISTRY_BASE_URL#^$DATA_REGISTRY_BASE_URL^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#NUMBER_DOWNLOADER_THREADS#^$NUMBER_DOWNLOADER_THREADS^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#FORCE_REDOWNLOAD_AFTER_HOURS#^$FORCE_REDOWNLOAD_AFTER_HOURS^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#^$REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#ZIP_WORKING_DIR#^$ZIP_WORKING_DIR^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+sed -i "s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
diff --git a/azure-deployment/manual-azure-deploy-from-local.sh b/azure-deployment/manual-azure-deploy-from-local.sh
new file mode 100755
index 0000000..481e605
--- /dev/null
+++ b/azure-deployment/manual-azure-deploy-from-local.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+set -uo pipefail
+
+if [ ! -v "1" ]; then
+ echo "usage: $0 TARGET_ENVIRONMENT"
+ echo " TARGET_ENVIRONMENT should likely be 'test', 'dev', or 'prod'"
+ exit 1
+fi
+
+if [ ! -d ".git" ]; then
+ echo "$0: script must be run from the root of the bulk-data-service repository"
+ exit 1
+fi
+
+git remote -v | grep "IATI/bulk-data-service.git" > /dev/null
+
+if [ "$?" != 0 ]; then
+ echo "$0: script must be run from the root of the bulk-data-service repository"
+ exit 1
+fi
+
+. ./azure-deployment/manual-azure-deploy-secrets.env
+
+TARGET_ENVIRONMENT=$1
+
+APP_NAME=bulk-data-service
+
+RESOURCE_GROUP_NAME=rg-${APP_NAME}-${TARGET_ENVIRONMENT}
+
+CONTAINER_GROUP_INSTANCE_NAME=aci-${APP_NAME}-${TARGET_ENVIRONMENT}
+
+LOCAL_DEPLOY=true
+
+echo "Generating Azure ARM deployment manifest from template"
+. ./azure-deployment/generate-manifest-from-template.sh
+
+# build the docker image for the Bulk Data Service
+docker build . -t criati.azurecr.io/bulk-data-service-$TARGET_ENVIRONMENT
+
+# push Bulk Data Service image to Azure
+docker push criati.azurecr.io/bulk-data-service-$TARGET_ENVIRONMENT
+
+# now configure, build and push the docker image for the nginx reverse proxy
+
+# create password file
+htpasswd -c -b ./azure-deployment/nginx-reverse-proxy/htpasswd prom "$PROM_NGINX_REVERSE_PROXY_PASSWORD"
+
+# make the image for the nginx reverse proxy (for putting HTTP basic auth on the
+# prom client)
+docker build ./azure-deployment/nginx-reverse-proxy -t criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT
+
+docker push criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT
+
+
+echo az container delete \
+ --resource-group "$RESOURCE_GROUP_NAME" \
+ --name "$CONTAINER_GROUP_INSTANCE_NAME"
+az container delete \
+ --resource-group "$RESOURCE_GROUP_NAME" \
+ --name "$CONTAINER_GROUP_INSTANCE_NAME"
+
+echo az container create \
+ --resource-group "$RESOURCE_GROUP_NAME" \
+ --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml
+az container create \
+ --resource-group "$RESOURCE_GROUP_NAME" \
+ --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml
diff --git a/azure-deployment/manual-azure-deploy-secrets-example.env b/azure-deployment/manual-azure-deploy-secrets-example.env
new file mode 100644
index 0000000..fbe3ba4
--- /dev/null
+++ b/azure-deployment/manual-azure-deploy-secrets-example.env
@@ -0,0 +1,24 @@
+# This file is used when doing a manual Azure deploy from a local machine. It should
+# contain the equivalent of the secrets that are stored in Github actions
+
+ACR_LOGIN_SERVER=
+ACR_USERNAME=
+ACR_PASSWORD=
+
+DOCKER_HUB_USERNAME=
+DOCKER_HUB_TOKEN=
+
+AZURE_STORAGE_CONNECTION_STRING=
+
+LOG_WORKSPACE_ID=
+LOG_WORKSPACE_KEY=
+
+DB_USER=
+DB_PASS=
+DB_HOST=
+DB_PORT=
+DB_NAME=
+DB_SSL_MODE=require
+DB_CONNECTION_TIMEOUT=30
+
+PROM_NGINX_REVERSE_PROXY_PASSWORD=
diff --git a/azure-deployment/manual-azure-deploy-variables-example.env b/azure-deployment/manual-azure-deploy-variables-example.env
new file mode 100644
index 0000000..745b4db
--- /dev/null
+++ b/azure-deployment/manual-azure-deploy-variables-example.env
@@ -0,0 +1,22 @@
+# This file is used when doing a manual Azure deploy from a local machine. It should
+# contain the equivalent of the variables that are stored in Github actions
+
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip
+
+# Value of BLOB_STORAGE_BASE_PUBLIC_URL generated automatically by deploy scripts
+# BLOB_STORAGE_BASE_PUBLIC_URL=
+
+DATA_REGISTRATION=ckan-registry
+DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
+
+FORCE_REDOWNLOAD_AFTER_HOURS=24
+
+# Log file
+LOGFILE=
+
+NUMBER_DOWNLOADER_THREADS=25
+
+REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72
+
+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
diff --git a/azure-deployment/nginx-reverse-proxy/Dockerfile b/azure-deployment/nginx-reverse-proxy/Dockerfile
new file mode 100644
index 0000000..b16441a
--- /dev/null
+++ b/azure-deployment/nginx-reverse-proxy/Dockerfile
@@ -0,0 +1,3 @@
+FROM nginx
+COPY nginx.conf /etc/nginx/nginx.conf
+COPY htpasswd /etc/nginx/htpasswd
diff --git a/azure-deployment/nginx-reverse-proxy/nginx.conf b/azure-deployment/nginx-reverse-proxy/nginx.conf
new file mode 100644
index 0000000..0605f24
--- /dev/null
+++ b/azure-deployment/nginx-reverse-proxy/nginx.conf
@@ -0,0 +1,46 @@
+
+user nginx;
+worker_processes auto;
+
+error_log /var/log/nginx/error.log notice;
+pid /var/run/nginx.pid;
+
+
+events {
+ worker_connections 1024;
+}
+
+
+http {
+ default_type application/octet-stream;
+
+ log_format main '$remote_addr - $remote_user [$time_local] "$request" '
+ '$status $body_bytes_sent "$http_referer" '
+ '"$http_user_agent" "$http_x_forwarded_for"';
+
+ keepalive_timeout 65;
+
+ gzip on;
+
+ include /etc/nginx/conf.d/*.conf;
+
+ server {
+ listen 9158 default_server;
+ listen [::]:9158 default_server;
+
+ root /var/www/html;
+
+ server_name _;
+
+ location / {
+ proxy_pass http://localhost:9090;
+ }
+
+ auth_basic "IATI Bulk Data Service Metrics Exporter";
+ auth_basic_user_file htpasswd;
+
+ access_log /var/log/nginx/prometheus-bulk-data-service-exporter-access.log;
+ error_log /var/log/nginx/prometheus-bulk-data-service-exporter-error.log;
+ }
+}
+
diff --git a/azure-provision/azure-create-resources.sh b/azure-provision/azure-create-resources.sh
new file mode 100755
index 0000000..80514b6
--- /dev/null
+++ b/azure-provision/azure-create-resources.sh
@@ -0,0 +1,239 @@
+#!/usr/bin/env bash
+
+# This script sets up and configures the Azure resources that are needed for a given
+# installation of the Bulk Data Service app:
+# resource group, log analytics workspace, storage account, postgres database.
+
+set -euo pipefail
+
+if [ ! -v "1" ]; then
+ echo "usage: $0 TARGET_ENVIRONMENT"
+ echo " TARGET_ENVIRONMENT should likely be 'test', 'dev', or 'prod'"
+ exit 1
+fi
+
+if [ "$1" == "" ]; then
+ echo "TARGET_ENVIRONMENT cannot be empty"
+ exit 2
+fi
+
+if [ ! -v "BDS_DB_ADMIN_PASSWORD" ] || [ "$BDS_DB_ADMIN_PASSWORD" == "" ]; then
+ echo "The environment variable BDS_DB_ADMIN_PASSWORD must be set"
+ exit 2
+fi
+
+if ! command -v jq > /dev/null; then
+ echo "This script requires the tool 'jq' to be installed"
+ exit 3
+fi
+
+TARGET_ENVIRONMENT_ENV_VAR=$(echo "$1" | tr '[:lower:]' '[:upper:]')
+
+SUBSCRIPTION_ID=$(az account list | jq -r '.[0].id')
+
+APP_NAME=bulk-data-service
+
+APP_NAME_NO_HYPHENS=$(echo $APP_NAME | sed -e 's/-//g')
+
+RESOURCE_GROUP_NAME=rg-${APP_NAME}-$1
+
+LOG_ANALYTICS_NAME=log-${APP_NAME}-$1
+
+STORAGE_ACCOUNT_NAME=sa${APP_NAME_NO_HYPHENS}$1
+
+POSTGRES_SERVER_NAME=${APP_NAME}-db-$1
+
+SERVICE_PRINCIPAL_NAME=sp-${APP_NAME}-$1
+
+LOCATION=uksouth
+
+echo
+echo "Proceeding will create Azure services with the following names:"
+echo
+echo "App base name : $APP_NAME"
+echo "Resource group name : $RESOURCE_GROUP_NAME"
+echo "Log analytics workspace name : $LOG_ANALYTICS_NAME"
+echo "Storage account name : $STORAGE_ACCOUNT_NAME"
+echo "Postgres server name : $POSTGRES_SERVER_NAME"
+echo "Service principal name : $SERVICE_PRINCIPAL_NAME"
+echo
+echo
+echo "(Using subscription: $SUBSCRIPTION_ID)"
+echo
+echo
+
+read -p "Do you want to continue? ([y]es or [n]o) " -n 1 -r
+echo ""
+
+if [[ $(echo $REPLY | tr '[A-Z]' '[a-z]') != "y" ]];
+then
+ echo "User exited"
+ exit 4
+fi
+
+# Create Resource Group
+echo az group create --name $RESOURCE_GROUP_NAME --location $LOCATION
+az group create --name $RESOURCE_GROUP_NAME --location $LOCATION
+echo
+
+
+# Create Log Analytics Workspace
+echo az monitor log-analytics workspace create --resource-group $RESOURCE_GROUP_NAME \
+ --workspace-name $LOG_ANALYTICS_NAME
+LOG_ANALYTICS_CREATE_OUTPUT=$(az monitor log-analytics workspace create --resource-group $RESOURCE_GROUP_NAME \
+ --workspace-name $LOG_ANALYTICS_NAME)
+
+echo "LOG_ANALYTICS_WORKSPACE_ID=echo ${LOG_ANALYTICS_CREATE_OUTPUT//[$'\t\r\n ']} | jq -r '.customerId'"
+
+LOG_ANALYTICS_WORKSPACE_ID=$(echo "${LOG_ANALYTICS_CREATE_OUTPUT//[$'\t\r\n ']}" | jq -r '.customerId')
+
+echo Workspace ID is: $LOG_ANALYTICS_WORKSPACE_ID
+
+echo az monitor log-analytics workspace get-shared-keys \
+ -g $RESOURCE_GROUP_NAME \
+ -n $LOG_ANALYTICS_NAME \| jq -r '.primarySharedKey'
+
+LOG_ANALYTICS_WORKSPACE_KEY=$(az monitor log-analytics workspace get-shared-keys -g $RESOURCE_GROUP_NAME -n $LOG_ANALYTICS_NAME | jq -r '.primarySharedKey')
+
+echo Workspace key is: $LOG_ANALYTICS_WORKSPACE_KEY
+
+# Create storage account
+echo az storage account create --resource-group $RESOURCE_GROUP_NAME \
+ --name $STORAGE_ACCOUNT_NAME \
+ --location $LOCATION \
+ --sku Standard_LRS \
+ --enable-hierarchical-namespace true \
+ --kind StorageV2
+az storage account create --resource-group $RESOURCE_GROUP_NAME \
+ --name $STORAGE_ACCOUNT_NAME \
+ --location $LOCATION \
+ --sku Standard_LRS \
+ --enable-hierarchical-namespace true \
+ --kind StorageV2
+echo
+
+STORAGE_ACCOUNT_ID=$(az storage account list | jq -r ".[] | select(.name==\"$STORAGE_ACCOUNT_NAME\") | .id")
+
+echo az resource update --ids="$STORAGE_ACCOUNT_ID" --set properties.allowBlobPublicAccess=true
+az resource update --ids="$STORAGE_ACCOUNT_ID" --set properties.allowBlobPublicAccess=true
+
+echo "Waiting for 30 seconds before creating containers on the new storage account"
+sleep 30
+
+echo az storage container create --name iati-xml --account-name $STORAGE_ACCOUNT_NAME --public-access container
+az storage container create --name iati-xml --account-name $STORAGE_ACCOUNT_NAME --public-access container | jq
+
+echo az storage container create --name iati-zip --account-name $STORAGE_ACCOUNT_NAME --public-access container
+az storage container create --name iati-zip --account-name $STORAGE_ACCOUNT_NAME --public-access container | jq
+
+az storage blob service-properties update --account-name $STORAGE_ACCOUNT_NAME \
+ --static-website --404-document 404.html \
+ --index-document index.html
+
+echo az storage account show-connection-string --name $STORAGE_ACCOUNT_NAME \
+ --resource-group $RESOURCE_GROUP_NAME \
+ \| jq -r '.connectionString'
+
+STORAGE_ACCOUNT_CONNECTION_STRING=$(az storage account show-connection-string --name $STORAGE_ACCOUNT_NAME --resource-group $RESOURCE_GROUP_NAME | jq -r '.connectionString')
+
+
+WEB_BASE_URL="https://$STORAGE_ACCOUNT_NAME.blob.core.windows.net"
+# $(az storage account show -n $STORAGE_ACCOUNT_NAME -g $RESOURCE_GROUP_NAME --query "primaryEndpoints.web" --output tsv)
+
+sed -e "s#{{WEB_BASE_URL}}#$WEB_BASE_URL#" web/index-template.html > web/index.html
+
+az storage blob upload-batch -s web -d '$web' --account-name $STORAGE_ACCOUNT_NAME --overwrite
+
+# Provision Postgres Server
+echo az postgres flexible-server create -y -g $RESOURCE_GROUP_NAME \
+ -n $POSTGRES_SERVER_NAME --location $LOCATION \
+ --admin-user bds --admin-password $BDS_DB_ADMIN_PASSWORD \
+ --sku-name Standard_B1ms --tier Burstable --storage-size 32
+az postgres flexible-server create -y -g $RESOURCE_GROUP_NAME \
+ -n $POSTGRES_SERVER_NAME --location $LOCATION \
+ --admin-user bds --admin-password $BDS_DB_ADMIN_PASSWORD \
+ --sku-name Standard_B1ms --tier Burstable --storage-size 32
+
+# Create Postgres database
+echo az postgres flexible-server db create --resource-group $RESOURCE_GROUP_NAME \
+ --server-name $POSTGRES_SERVER_NAME \
+ --database-name bulk_data_service_db
+az postgres flexible-server db create --resource-group $RESOURCE_GROUP_NAME \
+ --server-name $POSTGRES_SERVER_NAME \
+ --database-name bulk_data_service_db
+
+
+# Add firewall rule to let other Azure resources access the database
+echo az postgres flexible-server firewall-rule create --resource-group $RESOURCE_GROUP_NAME \
+ --name $POSTGRES_SERVER_NAME \
+ --rule-name allowazureservices \
+ --start-ip-address 0.0.0.0
+az postgres flexible-server firewall-rule create --resource-group $RESOURCE_GROUP_NAME \
+ --name $POSTGRES_SERVER_NAME \
+ --rule-name allowazureservices \
+ --start-ip-address 0.0.0.0
+
+# Increase the maximum number of connections
+echo az postgres flexible-server parameter set --resource-group $RESOURCE_GROUP_NAME \
+ --server-name $POSTGRES_SERVER_NAME \
+ --name "max_connections" \
+ --value 85
+az postgres flexible-server parameter set --resource-group $RESOURCE_GROUP_NAME \
+ --server-name $POSTGRES_SERVER_NAME \
+ --name "max_connections" \
+ --value 85
+
+# create Azure service-principal
+
+RESOURCE_GROUP_ID_STRING=$(az group list --query "[?name=='$RESOURCE_GROUP_NAME']" | jq -r '.[0].id')
+
+echo az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME \
+ --role contributor \
+ --scopes $RESOURCE_GROUP_ID_STRING
+SP_DETAILS=$(az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME \
+ --role contributor \
+ --scopes $RESOURCE_GROUP_ID_STRING)
+
+CREDS=$(echo $SP_DETAILS | jq "with_entries(if .key == \"appId\" then .key = \"clientId\" elif .key == \"tenant\" then .key = \"tenantId\" elif .key == \"password\" then .key = \"clientSecret\" else . end) | . += { \"subscriptionId\" : \"$SUBSCRIPTION_ID\" } | del(.displayName)")
+
+echo
+echo
+echo "--------------------------------------------------"
+echo "Credentials to put into the Github repo's secrets:"
+echo
+
+echo "JSON credentials for Azure: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_AZURE_CREDENTIALS)"
+
+echo $CREDS
+
+echo "Azure storage connection string: (Secret name ${TARGET_ENVIRONMENT_ENV_VAR}_AZURE_STORAGE_CONNECTION_STRING)"
+
+echo $STORAGE_ACCOUNT_CONNECTION_STRING
+
+echo "Database host: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_DB_HOST)"
+
+echo $POSTGRES_SERVER_NAME
+
+echo "Database port: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_DB_PORT)"
+
+echo 5432
+
+echo "Database name: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_DB_NAME)"
+
+echo bulk_data_service_db
+
+echo "Database user: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_DB_USER)"
+
+echo bds
+
+echo "Database password: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_DB_PASS)"
+
+echo $BDS_DB_ADMIN_PASSWORD
+
+echo "Log analytics workspace ID: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_LOG_WORKSPACE_ID)"
+
+echo $LOG_ANALYTICS_WORKSPACE_ID
+
+echo "Log analytics workspace key: (Secret name: ${TARGET_ENVIRONMENT_ENV_VAR}_LOG_WORKSPACE_KEY)"
+
+echo $LOG_ANALYTICS_WORKSPACE_KEY
diff --git a/db-migrations/20240531_01_iY5Qa.rollback.sql b/db-migrations/20240531_01_iY5Qa.rollback.sql
new file mode 100644
index 0000000..46d2e27
--- /dev/null
+++ b/db-migrations/20240531_01_iY5Qa.rollback.sql
@@ -0,0 +1,4 @@
+--
+-- depends:
+
+drop table iati_datasets;
diff --git a/db-migrations/20240531_01_iY5Qa.sql b/db-migrations/20240531_01_iY5Qa.sql
new file mode 100644
index 0000000..91e6af6
--- /dev/null
+++ b/db-migrations/20240531_01_iY5Qa.sql
@@ -0,0 +1,71 @@
+--
+-- depends:
+
+-- auto-generated definition
+create table iati_datasets
+(
+ id uuid not null,
+ name varchar not null,
+ publisher_id uuid not null,
+ publisher_name varchar not null,
+ type varchar not null,
+ source_url varchar,
+ hash varchar,
+ hash_excluding_generated_timestamp varchar,
+ last_update_check timestamp with time zone,
+ last_head_attempt timestamp with time zone,
+ last_head_http_status smallint,
+ head_error_message varchar,
+ last_download_attempt timestamp with time zone,
+ last_download_http_status smallint,
+ last_successful_download timestamp with time zone,
+ last_verified_on_server timestamp with time zone,
+ download_error_message varchar,
+ content_modified timestamp with time zone,
+ content_modified_excluding_generated_timestamp timestamp with time zone,
+ server_header_last_modified timestamp with time zone,
+ server_header_etag varchar
+);
+
+comment on column iati_datasets.name is 'the short name of the dataset';
+
+comment on column iati_datasets.type is 'type of the IATI XML file: activity or organisation';
+
+comment on column iati_datasets.source_url is 'the source url of dataset on the publisher website';
+
+comment on column iati_datasets.hash is 'hash of dataset as generated by Bulk Data Service';
+
+comment on column iati_datasets.hash_excluding_generated_timestamp is 'hash of dataset content excluding the generated-datetime attribute as generated by Bulk Data Service';
+
+comment on column iati_datasets.last_update_check is 'the last time the status of the dataset was updated or checked';
+
+comment on column iati_datasets.last_head_attempt is 'the last time an HTTP HEAD request was attempted';
+
+comment on column iati_datasets.last_head_http_status is 'the HTTP status code of the last HEAD request';
+
+comment on column iati_datasets.head_error_message is 'the error message of the last HEAD request (if any)';
+
+comment on column iati_datasets.last_download_attempt is 'the last time a full download of dataset was attempted';
+
+comment on column iati_datasets.last_download_http_status is 'the HTTP status code of the last download attempt';
+
+comment on column iati_datasets.last_successful_download is 'the time of the last successful download of the dataset';
+
+comment on column iati_datasets.last_verified_on_server is 'the last time data was successfully downloaded or confirmed as unchanged by successful HEAD request';
+
+comment on column iati_datasets.download_error_message is 'the error message of the last full download attempt (if any)';
+
+comment on column iati_datasets.content_modified is 'the time at which any part of the IATI file changed';
+
+comment on column iati_datasets.content_modified_excluding_generated_timestamp is 'the time at which the substantive contents of the file changed, excluding the ''generated-datetime'' attribute ';
+
+comment on column iati_datasets.server_header_last_modified is 'the contents of the Last-Modified header on source server hosting the dataset ';
+
+comment on column iati_datasets.server_header_etag is 'the contents of the ETag header on source server hosting the dataset ';
+
+alter table iati_datasets
+ owner to bds;
+
+create unique index iati_datasets_pk
+ on iati_datasets (id);
+
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..2543a06
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,32 @@
+
+services:
+
+ iati-bulk-data-service-azurite:
+ image: mcr.microsoft.com/azure-storage/azurite
+ ports:
+ - 10000:10000
+ - 10001:10001
+ - 10002:10002
+ command: azurite --blobHost 0.0.0.0 --blobPort 10000 --queueHost 0.0.0.0 --queuePort 10001 --tableHost 0.0.0.0 --tablePort 10002 --location /data --loose --skipApiVersionCheck
+ volumes:
+ - iati-bulk-data-service-azurite:/data
+
+ iati-bulk-data-service-postgres:
+ image: postgres:15
+ ports:
+ - "${DB_PORT}:${DB_PORT}"
+ environment:
+ - POSTGRES_DB=${DB_NAME}
+ - POSTGRES_USER=${DB_USER}
+ - POSTGRES_PASSWORD=${DB_PASS}
+ - PGPORT=${DB_PORT}
+ - PGDATA=/var/lib/postgresql/data/pgdata
+ volumes:
+ - iati-bulk-data-service-postgres:/var/lib/postgresql/data
+
+volumes:
+ iati-bulk-data-service-azurite:
+ driver: local
+ iati-bulk-data-service-postgres:
+ driver: local
+
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..0503209
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,71 @@
+[project]
+name = "bulk-data-service"
+version = "0.0.1"
+requires-python = ">= 3.12"
+readme = "README.md"
+dependencies = [
+ "azure-storage-blob==12.20.0",
+ "psycopg[binary,pool]==3.1.18",
+ "requests==2.31.0",
+ "yoyo-migrations==8.2.0",
+ "prometheus-client==0.20.0"
+]
+
+
+[project.optional-dependencies]
+dev = [
+ "pip-tools",
+ "isort",
+ "mypy",
+ "pytest",
+ "black",
+ "flake8",
+ "flake8-pyproject",
+ "types-requests",
+ "python-dotenv"
+]
+
+
+[tool.pip-tools]
+strip-extras = true
+
+
+[tool.pytest.ini_options]
+testpaths = ["tests/unit", "tests/integration"]
+addopts = [
+ "--import-mode=importlib"
+]
+pythonpath = [
+ "src", "tests"
+]
+filterwarnings = [
+ "ignore::DeprecationWarning:yoyo.*:", # ignore deprecations from all modules
+ "default::DeprecationWarning:bulk_data_service.*:", # except the app
+]
+
+
+[tool.isort]
+py_version=312
+extend_skip = ['__pycache__', '.mypy_cache', '.ve', '.venv', '.vagrant-ve']
+skip_gitignore = true
+src_paths = ['src', 'tests']
+line_length = 120
+
+
+[tool.mypy]
+files=["src/"]
+mypy_path=["tests/"]
+
+
+[tool.flake8]
+max-line-length = 120
+extend_ignore = ['E203', 'W503', 'E275']
+exclude = ['__pycache__', '.mypy_cache', '.pytest_cache', '.ve', '.venv', '.vagrant-ve']
+max_complexity = 7
+
+
+[tool.black]
+line-length = 120
+target-version = ['py312']
+include='''src/.*/*.py$'''
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..9169a7e
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,120 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+# pip-compile --extra=dev --output-file=requirements-dev.txt --strip-extras pyproject.toml
+#
+azure-core==1.30.2
+ # via azure-storage-blob
+azure-storage-blob==12.20.0
+ # via bulk-data-service (pyproject.toml)
+black==24.4.2
+ # via bulk-data-service (pyproject.toml)
+build==1.2.1
+ # via pip-tools
+certifi==2024.7.4
+ # via requests
+cffi==1.16.0
+ # via cryptography
+charset-normalizer==3.3.2
+ # via requests
+click==8.1.7
+ # via
+ # black
+ # pip-tools
+cryptography==42.0.8
+ # via azure-storage-blob
+flake8==7.1.0
+ # via
+ # bulk-data-service (pyproject.toml)
+ # flake8-pyproject
+flake8-pyproject==1.2.3
+ # via bulk-data-service (pyproject.toml)
+idna==3.7
+ # via requests
+importlib-metadata==8.0.0
+ # via yoyo-migrations
+iniconfig==2.0.0
+ # via pytest
+isodate==0.6.1
+ # via azure-storage-blob
+isort==5.13.2
+ # via bulk-data-service (pyproject.toml)
+mccabe==0.7.0
+ # via flake8
+mypy==1.10.1
+ # via bulk-data-service (pyproject.toml)
+mypy-extensions==1.0.0
+ # via
+ # black
+ # mypy
+packaging==24.1
+ # via
+ # black
+ # build
+ # pytest
+pathspec==0.12.1
+ # via black
+pip-tools==7.4.1
+ # via bulk-data-service (pyproject.toml)
+platformdirs==4.2.2
+ # via black
+pluggy==1.5.0
+ # via pytest
+prometheus-client==0.20.0
+ # via bulk-data-service (pyproject.toml)
+psycopg==3.1.18
+ # via bulk-data-service (pyproject.toml)
+psycopg-binary==3.1.18
+ # via psycopg
+psycopg-pool==3.2.2
+ # via psycopg
+pycodestyle==2.12.0
+ # via flake8
+pycparser==2.22
+ # via cffi
+pyflakes==3.2.0
+ # via flake8
+pyproject-hooks==1.1.0
+ # via
+ # build
+ # pip-tools
+pytest==8.2.2
+ # via bulk-data-service (pyproject.toml)
+python-dotenv==1.0.1
+ # via bulk-data-service (pyproject.toml)
+requests==2.31.0
+ # via
+ # azure-core
+ # bulk-data-service (pyproject.toml)
+six==1.16.0
+ # via
+ # azure-core
+ # isodate
+sqlparse==0.5.0
+ # via yoyo-migrations
+tabulate==0.9.0
+ # via yoyo-migrations
+types-requests==2.32.0.20240622
+ # via bulk-data-service (pyproject.toml)
+typing-extensions==4.12.2
+ # via
+ # azure-core
+ # azure-storage-blob
+ # mypy
+ # psycopg
+ # psycopg-pool
+urllib3==2.2.2
+ # via
+ # requests
+ # types-requests
+wheel==0.43.0
+ # via pip-tools
+yoyo-migrations==8.2.0
+ # via bulk-data-service (pyproject.toml)
+zipp==3.19.2
+ # via importlib-metadata
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a37dbd9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,58 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+# pip-compile --strip-extras pyproject.toml
+#
+azure-core==1.30.2
+ # via azure-storage-blob
+azure-storage-blob==12.20.0
+ # via bulk-data-service (pyproject.toml)
+certifi==2024.7.4
+ # via requests
+cffi==1.16.0
+ # via cryptography
+charset-normalizer==3.3.2
+ # via requests
+cryptography==42.0.8
+ # via azure-storage-blob
+idna==3.7
+ # via requests
+importlib-metadata==8.0.0
+ # via yoyo-migrations
+isodate==0.6.1
+ # via azure-storage-blob
+prometheus-client==0.20.0
+ # via bulk-data-service (pyproject.toml)
+psycopg==3.1.18
+ # via bulk-data-service (pyproject.toml)
+psycopg-binary==3.1.18
+ # via psycopg
+psycopg-pool==3.2.2
+ # via psycopg
+pycparser==2.22
+ # via cffi
+requests==2.31.0
+ # via
+ # azure-core
+ # bulk-data-service (pyproject.toml)
+six==1.16.0
+ # via
+ # azure-core
+ # isodate
+sqlparse==0.5.0
+ # via yoyo-migrations
+tabulate==0.9.0
+ # via yoyo-migrations
+typing-extensions==4.12.2
+ # via
+ # azure-core
+ # azure-storage-blob
+ # psycopg
+ # psycopg-pool
+urllib3==2.2.2
+ # via requests
+yoyo-migrations==8.2.0
+ # via bulk-data-service (pyproject.toml)
+zipp==3.19.2
+ # via importlib-metadata
diff --git a/src/bulk_data_service/__init__.py b/src/bulk_data_service/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/bulk_data_service/checker.py b/src/bulk_data_service/checker.py
new file mode 100644
index 0000000..5d6d417
--- /dev/null
+++ b/src/bulk_data_service/checker.py
@@ -0,0 +1,85 @@
+import datetime
+import time
+import traceback
+import uuid
+
+from bulk_data_service.dataset_indexing import create_and_upload_indices
+from bulk_data_service.dataset_remover import remove_deleted_datasets_from_bds, remove_expired_downloads
+from bulk_data_service.dataset_updater import add_or_update_datasets
+from bulk_data_service.zipper import zipper_run
+from dataset_registration.registration_services import get_registered_datasets
+from utilities.db import get_datasets_in_bds
+from utilities.prometheus import initialise_prometheus_client, update_metrics_from_db
+
+
+def checker(context: dict):
+ context = initialise_prometheus_client(context)
+
+ if context["single_run"]:
+ checker_run(context, get_datasets_in_bds(context))
+ else:
+ checker_service_loop(context)
+
+
+def checker_service_loop(context: dict):
+
+ datasets_in_zip = {} # type: dict[uuid.UUID, dict]
+ datasets_in_bds = get_datasets_in_bds(context)
+
+ while True:
+ try:
+
+ checker_run(context, datasets_in_bds)
+
+ zipper_run(context, datasets_in_zip, datasets_in_bds)
+
+ time.sleep(60 * 30)
+
+ except Exception as e:
+ context["logger"].error(
+ "Unknown exception in checker service loop. "
+ "Waiting 10 minutes then restarting. "
+ "Exception message: {}".format(e).replace("\n", "")
+ )
+ context["logger"].error("Full traceback: " "{}".format(traceback.format_exc()))
+
+ context["prom_metrics"]["number_crashes"].inc()
+
+ time.sleep(60 * 10)
+
+
+def checker_run(context: dict, datasets_in_bds: dict[uuid.UUID, dict]):
+ run_start = datetime.datetime.now(datetime.UTC)
+
+ context["logger"].info("Checker starting run")
+
+ try:
+ registered_datasets = get_registered_datasets(context)
+ except RuntimeError as e:
+ context["logger"].error(
+ "Unable to download list of datasets from registration service. " "Details: {}".format(e)
+ )
+ context["logger"].error("Checker aborted.")
+ return
+
+ remove_deleted_datasets_from_bds(context, datasets_in_bds, registered_datasets)
+
+ add_or_update_datasets(context, datasets_in_bds, registered_datasets)
+
+ remove_expired_downloads(context, datasets_in_bds)
+
+ create_and_upload_indices(context, datasets_in_bds)
+
+ update_metrics_from_db(context)
+
+ run_end = datetime.datetime.now(datetime.UTC)
+
+ context["prom_metrics"]["checker_run_duration"].set((run_end - run_start).seconds)
+
+ context["logger"].info(
+ "Checker finished in {}. Datasets processed: {}. Seconds per dataset: {}".format(
+ run_end - run_start,
+ len(registered_datasets),
+ ((run_end - run_start) / len(registered_datasets)).total_seconds(),
+ )
+ )
diff --git a/src/bulk_data_service/dataset_indexing.py b/src/bulk_data_service/dataset_indexing.py
new file mode 100644
index 0000000..58a741e
--- /dev/null
+++ b/src/bulk_data_service/dataset_indexing.py
@@ -0,0 +1,82 @@
+import json
+import uuid
+from typing import Any
+
+from azure.storage.blob import BlobServiceClient
+
+from utilities.azure import azure_upload_to_blob, get_azure_blob_public_url
+from utilities.misc import get_timestamp
+
+
+def create_and_upload_indices(context: dict, datasets_in_bds: dict[uuid.UUID, dict]):
+
+ minimal_index = create_index_json(context, datasets_in_bds, "minimal")
+
+ full_index = create_index_json(context, datasets_in_bds, "full")
+
+ upload_index_json_to_azure(context, get_index_name(context, "minimal"), minimal_index)
+
+ upload_index_json_to_azure(context, get_index_name(context, "full"), full_index)
+
+
+def upload_index_json_to_azure(context: dict, index_name: str, index_json: str):
+
+ az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ for container in [
+ context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"],
+ context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"],
+ ]:
+
+ azure_upload_to_blob(az_blob_service, container, index_name, index_json, "application/json")
+
+ az_blob_service.close()
+
+
+def create_index_json(context: dict, datasets_in_bds: dict[uuid.UUID, dict], index_type: str) -> str:
+
+ index = {"index_created": get_timestamp(), "datasets": {}}
+
+ index["datasets"] = get_dataset_index(context, datasets_in_bds, index_type)
+
+ return json.dumps(index, default=str, sort_keys=True, indent=True)
+
+
+def get_dataset_index(context: dict, datasets_in_bds: dict[uuid.UUID, dict], index_type: str) -> dict:
+ return {v["name"]: get_index_item(context, v, index_type) for _, v in datasets_in_bds.items()}
+
+
+def get_index_item(context: dict, dataset: dict, index_type: str) -> dict[str, Any]:
+
+ if index_type == "minimal":
+ dataset_index = {k: v for k, v in dataset.items() if k in get_minimal_index_fields(context)}
+ else:
+ dataset_index = {k: v for k, v in dataset.items()}
+
+ dataset_index["url_xml"] = ""
+ dataset_index["url_zip"] = ""
+
+ if dataset_index["last_successful_download"] is not None:
+ dataset_index["url_xml"] = get_azure_blob_public_url(context, dataset, "xml")
+ dataset_index["url_zip"] = get_azure_blob_public_url(context, dataset, "zip")
+
+ return dataset_index
+
+
+def get_index_name(context: dict, index_type: str) -> str:
+ if index_type not in ["minimal", "full"]:
+ raise ValueError("Unknown type for dataset index")
+
+ return "dataset-index-{}.json".format(index_type)
+
+
+def get_minimal_index_fields(context: dict) -> list:
+ return [
+ "id",
+ "name",
+ "publisher_name",
+ "source_url",
+ "hash",
+ "hash_excluding_generated_timestamp",
+ "last_successful_download",
+ ]
diff --git a/src/bulk_data_service/dataset_remover.py b/src/bulk_data_service/dataset_remover.py
new file mode 100644
index 0000000..0326189
--- /dev/null
+++ b/src/bulk_data_service/dataset_remover.py
@@ -0,0 +1,96 @@
+import uuid
+from datetime import timedelta
+from typing import Any
+
+import psycopg
+from azure.storage.blob import BlobServiceClient
+
+from utilities.azure import delete_azure_iati_blob
+from utilities.db import get_db_connection, insert_or_update_dataset, remove_dataset_from_db
+from utilities.misc import get_timestamp
+
+
+def remove_deleted_datasets_from_bds(
+ context: dict[str, Any], datasets_in_bds: dict[uuid.UUID, dict], registered_datasets: dict[uuid.UUID, dict]
+):
+
+ db_conn = get_db_connection(context)
+
+ az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ ids_to_delete = [k for k in datasets_in_bds.keys() if k not in registered_datasets]
+
+ context["prom_metrics"]["datasets_unregistered"].set(len(ids_to_delete))
+
+ for id in ids_to_delete:
+
+ context["logger"].info(
+ "dataset id: {} - Dataset no longer exists in registration "
+ "service so removing from Bulk Data Service".format(id)
+ )
+
+ remove_dataset_from_db(db_conn, id)
+
+ delete_azure_iati_blob(context, az_blob_service, datasets_in_bds[id], "xml")
+
+ delete_azure_iati_blob(context, az_blob_service, datasets_in_bds[id], "zip")
+
+ del datasets_in_bds[id]
+
+ az_blob_service.close()
+
+ db_conn.close()
+
+
+def remove_expired_downloads(context: dict[str, Any], datasets_in_bds: dict[uuid.UUID, dict]):
+
+ db_conn = get_db_connection(context)
+
+ az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ expired_datasets = 0
+
+ for dataset in datasets_in_bds.values():
+ if dataset_has_expired(context, dataset):
+ remove_download_for_expired_dataset(context, db_conn, az_blob_service, dataset)
+ expired_datasets += 1
+
+ context["prom_metrics"]["datasets_expired"].set(expired_datasets)
+
+ az_blob_service.close()
+
+ db_conn.close()
+
+
+def remove_download_for_expired_dataset(
+ context: dict[str, Any], db_conn: psycopg.Connection, az_blob_service: BlobServiceClient, bds_dataset: dict
+) -> dict:
+
+ max_hours = int(context["REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS"])
+
+ context["logger"].info(
+ "dataset id: {} - Last good download for dataset "
+ "is over max threshold of {} hours, so removing "
+ "last good download from Bulk Data Service".format(bds_dataset["id"], max_hours)
+ )
+
+ bds_dataset["last_successful_download"] = None
+ bds_dataset["hash"] = None
+ bds_dataset["hash_excluding_generated_timestamp"] = None
+
+ insert_or_update_dataset(db_conn, bds_dataset)
+
+ delete_azure_iati_blob(context, az_blob_service, bds_dataset, "xml")
+
+ delete_azure_iati_blob(context, az_blob_service, bds_dataset, "zip")
+
+ return bds_dataset
+
+
+def dataset_has_expired(context: dict[str, Any], bds_dataset: dict) -> bool:
+
+ max_hours = int(context["REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS"])
+
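+    # A dataset has "expired" when its last good download is older than the
+    # REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS threshold.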
+ return bds_dataset["last_successful_download"] is not None and bds_dataset["last_successful_download"] < (
+ get_timestamp() - timedelta(hours=max_hours)
+ )
diff --git a/src/bulk_data_service/dataset_updater.py b/src/bulk_data_service/dataset_updater.py
new file mode 100644
index 0000000..a33a4ef
--- /dev/null
+++ b/src/bulk_data_service/dataset_updater.py
@@ -0,0 +1,308 @@
+import concurrent.futures
+import uuid
+from datetime import datetime, timedelta
+from itertools import batched
+from random import random
+
+import psycopg
+import requests
+from azure.storage.blob import BlobServiceClient
+
+from utilities.azure import azure_upload_to_blob
+from utilities.db import get_db_connection, insert_or_update_dataset
+from utilities.http import get_requests_session, http_download_dataset, http_head_dataset
+from utilities.misc import (
+ get_hash,
+ get_hash_excluding_generated_timestamp,
+ get_timestamp,
+ set_timestamp_tz_utc,
+ zip_data_as_single_file,
+)
+
+
+def add_or_update_datasets(
+ context: dict, datasets_in_bds: dict[uuid.UUID, dict], registered_datasets: dict[uuid.UUID, dict]
+):
+
+ context["prom_metrics"]["total_number_of_datasets"].set(len(registered_datasets))
+ context["prom_metrics"]["datasets_added"].set(len(registered_datasets) - len(datasets_in_bds))
+
+ threads = []
+
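+    # Split the registered datasets into roughly NUMBER_DOWNLOADER_THREADS batches
+    # of equal size; each batch is handled by its own worker thread.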
+    batch_size = int(len(registered_datasets) / int(context["NUMBER_DOWNLOADER_THREADS"])) + 1
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=int(context["NUMBER_DOWNLOADER_THREADS"])) as executor:
+        for dataset_batch_ids in batched(registered_datasets, batch_size):
+
+ dataset_batch = {k: registered_datasets[k] for k in dataset_batch_ids}
+
+ threads.append(executor.submit(add_or_update_dataset_batch, context, datasets_in_bds, dataset_batch))
+
+ for future in concurrent.futures.as_completed(threads):
+ future.result()
+
+
+def add_or_update_dataset_batch(
+ context: dict, datasets_in_bds: dict[uuid.UUID, dict], registered_datasets_to_update: dict[uuid.UUID, dict]
+):
+
+ db_conn = get_db_connection(context)
+
+ az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ session = get_requests_session()
+
+ for registered_dataset_id in registered_datasets_to_update:
+
+ add_or_update_registered_dataset(
+ context,
+ registered_dataset_id,
+ datasets_in_bds,
+ registered_datasets_to_update,
+ az_blob_service,
+ session,
+ db_conn,
+ )
+
+ session.close()
+
+ az_blob_service.close()
+
+ db_conn.close()
+
+
+def add_or_update_registered_dataset(
+ context: dict,
+ registered_dataset_id: uuid.UUID,
+ datasets_in_bds: dict[uuid.UUID, dict],
+ registered_datasets: dict[uuid.UUID, dict],
+ az_blob_service: BlobServiceClient,
+ session: requests.Session,
+ db_conn: psycopg.Connection,
+):
+
+ if registered_dataset_id not in datasets_in_bds:
+ bds_dataset = create_bds_dataset(registered_datasets[registered_dataset_id])
+ datasets_in_bds[registered_dataset_id] = bds_dataset
+ else:
+ bds_dataset = datasets_in_bds[registered_dataset_id]
+ update_bds_dataset_registration_info(bds_dataset, registered_datasets[registered_dataset_id])
+
+ attempt_download = True
+
+ bds_dataset["last_update_check"] = get_timestamp()
+
+ download_within_hours = get_randomised_download_within_hours(context)
+
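+    # If there was a recent successful download, a cheaper HEAD request decides
+    # whether the source has changed and a full re-download is needed.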
+ if dataset_downloaded_within(bds_dataset, download_within_hours):
+
+ attempt_download = check_dataset_etag_last_mod_header(
+ context, db_conn, session, bds_dataset, download_within_hours
+ )
+
+ if attempt_download:
+ try:
+ download_and_save_dataset(context, session, az_blob_service, bds_dataset)
+
+ datasets_in_bds[registered_dataset_id] = bds_dataset
+
+ insert_or_update_dataset(db_conn, bds_dataset)
+
+ context["logger"].info("dataset id: {} - Added/updated dataset".format(bds_dataset["id"]))
+
+ except RuntimeError as e:
+ bds_dataset["download_error_message"] = "Download of IATI XML failed with non-200 HTTP status: {}".format(e)
+ context["logger"].warning(
+ "dataset id: {} - {}".format(registered_dataset_id, bds_dataset["download_error_message"])
+ )
+ bds_dataset["last_download_attempt"] = get_timestamp()
+ bds_dataset["last_download_http_status"] = e.args[0]["http_status_code"]
+ insert_or_update_dataset(db_conn, bds_dataset)
+ except Exception as e:
+ bds_dataset["last_download_attempt"] = get_timestamp()
+ bds_dataset["download_error_message"] = (
+ "Download of IATI XML produced EXCEPTION with GET request: {}".format(e)
+ )
+ context["logger"].warning(
+ "dataset id: {} - {}".format(registered_dataset_id, bds_dataset["download_error_message"])
+ )
+ insert_or_update_dataset(db_conn, bds_dataset)
+
+
+def get_randomised_download_within_hours(context: dict) -> int:
+ hours_force_redownload = int(context["FORCE_REDOWNLOAD_AFTER_HOURS"])
+
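+    # Subtract up to 8 random hours so that re-downloads of all datasets do not
+    # fall due at exactly the same time on every run.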
+ if hours_force_redownload > 8:
+ hours_force_redownload -= int(random() * 8)
+
+ return hours_force_redownload
+
+
+def dataset_downloaded_within(bds_dataset: dict, hours: int) -> bool:
+ hours_ago = get_timestamp() - timedelta(hours=hours)
+ return bds_dataset["last_successful_download"] is not None and bds_dataset["last_successful_download"] > hours_ago
+
+
+def check_dataset_etag_last_mod_header(
+ context: dict, db_conn: psycopg.Connection, session: requests.Session, bds_dataset: dict, download_within_hours: int
+) -> bool:
+
+ attempt_download = True
+
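+    # A HEAD request is made first; the full download is skipped when neither the
+    # ETag nor the Last-Modified header has changed (or when the HEAD request fails
+    # but the dataset was downloaded within the last 6 hours).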
+ try:
+ head_response = http_head_dataset(session, bds_dataset["source_url"])
+
+ if "ETag" in head_response.headers and head_response.headers["ETag"] != bds_dataset["server_header_etag"]:
+
+ context["logger"].info(
+ "dataset id: {} - Last successful download within {} hours, "
+ "but ETag changed so redownloading".format(bds_dataset["id"], download_within_hours)
+ )
+
+ update_dataset_head_request_fields(bds_dataset, head_response.status_code)
+
+ elif "Last-Modified" in head_response.headers and set_timestamp_tz_utc(
+ datetime.strptime(head_response.headers["Last-Modified"], "%a, %d %b %Y %H:%M:%S GMT")
+ ) != set_timestamp_tz_utc(bds_dataset["server_header_last_modified"]):
+
+ context["logger"].info(
+ "dataset id: {} - Last successful download within {} hours, "
+ "but Last-Modified header changed so redownloading".format(bds_dataset["id"], download_within_hours)
+ )
+
+ update_dataset_head_request_fields(bds_dataset, head_response.status_code)
+
+ else:
+ context["logger"].info(
+ "dataset id: {} - Last successful download within {} hours, "
+ "Last-Modified and ETag same, so not redownloading".format(bds_dataset["id"], download_within_hours)
+ )
+
+ update_dataset_head_request_fields(bds_dataset, head_response.status_code)
+
+ bds_dataset["last_verified_on_server"] = bds_dataset["last_head_attempt"]
+
+ insert_or_update_dataset(db_conn, bds_dataset)
+
+ attempt_download = False
+
+ except RuntimeError as e:
+
+ if dataset_downloaded_within(bds_dataset, 6):
+ extra_err_message = "Dataset downloaded within the last 6 hours so not " "forcing full re-download attempt."
+ attempt_download = False
+ else:
+ extra_err_message = "Dataset not downloaded within the last 6 hours so " "forcing full re-download attempt."
+ attempt_download = True
+
+ bds_dataset["head_error_message"] = (
+ "Last successful download within {} hours, "
+ "but HEAD request to check ETag/Last-Modified "
+ "return non-200 status. {} "
+ "HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
+ )
+
+ context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"]))
+
+ update_dataset_head_request_fields(
+ bds_dataset, e.args[0]["http_status_code"], bds_dataset["head_error_message"]
+ )
+
+ insert_or_update_dataset(db_conn, bds_dataset)
+
+ except Exception as e:
+ context["logger"].warning(
+ "dataset id: {} - EXCEPTION with HEAD request, details: " "{}".format(bds_dataset["id"], e)
+ )
+
+ return attempt_download
+
+
+def download_and_save_dataset(
+ context: dict, session: requests.Session, az_blob_service: BlobServiceClient, bds_dataset: dict
+):
+
+ last_download_attempt = get_timestamp()
+
+ download_response = http_download_dataset(session, bds_dataset["source_url"])
+
+ hash = get_hash(download_response.text)
+ hash_excluding_generated = get_hash_excluding_generated_timestamp(download_response.text)
+
+ if hash == bds_dataset["hash"]:
+ context["logger"].info(
+ "dataset id: {} - Hash of download is identical to "
+ "previous value, so not re-zipping and re-uploading to Azure".format(bds_dataset["id"])
+ )
+ else:
+ iati_xml_zipped = zip_data_as_single_file(bds_dataset["name"] + ".xml", download_response.text)
+
+ azure_upload_to_blob(
+ az_blob_service,
+ context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"],
+ "{}/{}.xml".format(bds_dataset["publisher_name"], bds_dataset["name"]),
+ download_response.text,
+ "application/xml",
+ )
+
+ azure_upload_to_blob(
+ az_blob_service,
+ context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"],
+ "{}/{}.zip".format(bds_dataset["publisher_name"], bds_dataset["name"]),
+ iati_xml_zipped,
+ "application/zip",
+ )
+
+ bds_dataset.update(
+ {
+ "hash": hash,
+ "hash_excluding_generated_timestamp": hash_excluding_generated,
+ "last_update_check": last_download_attempt,
+ "last_download_attempt": last_download_attempt,
+ "last_download_http_status": download_response.status_code,
+ "last_successful_download": last_download_attempt,
+ "last_verified_on_server": last_download_attempt,
+ "download_error_message": None,
+ "content_modified": None,
+ "content_modified_excluding_generated_timestamp": None,
+ "server_header_last_modified": download_response.headers.get("Last-Modified", None),
+ "server_header_etag": download_response.headers.get("ETag", None),
+ }
+ )
+
+
+def update_dataset_head_request_fields(dataset: dict, status_code: int, error_msg: str = ""):
+ dataset["last_head_attempt"] = get_timestamp()
+ dataset["last_head_http_status"] = status_code
+ dataset["head_error_message"] = error_msg
+
+
+def create_bds_dataset(registered_dataset: dict) -> dict:
+ return {
+ "id": registered_dataset["id"],
+ "name": registered_dataset["name"],
+ "publisher_id": registered_dataset["publisher_id"],
+ "publisher_name": registered_dataset["publisher_name"],
+ "type": registered_dataset["type"],
+ "source_url": registered_dataset["source_url"],
+ "hash": None,
+ "hash_excluding_generated_timestamp": None,
+ "last_update_check": None,
+ "last_head_attempt": None,
+ "last_head_http_status": None,
+ "head_error_message": None,
+ "last_verified_on_server": None,
+ "last_download_attempt": None,
+ "last_download_http_status": None,
+ "last_successful_download": None,
+ "download_error_message": None,
+ "content_modified": None,
+ "content_modified_excluding_generated_timestamp": None,
+ "server_header_last_modified": None,
+ "server_header_etag": None,
+ }
+
+
+def update_bds_dataset_registration_info(bds_dataset: dict, registered_dataset: dict):
+ for field in ["publisher_id", "publisher_name", "type", "source_url"]:
+ bds_dataset[field] = registered_dataset[field]
diff --git a/src/bulk_data_service/zipper.py b/src/bulk_data_service/zipper.py
new file mode 100644
index 0000000..0bd3634
--- /dev/null
+++ b/src/bulk_data_service/zipper.py
@@ -0,0 +1,178 @@
+import datetime
+import os
+import shutil
+import time
+import uuid
+
+from azure.storage.blob import BlobServiceClient, ContentSettings
+
+from bulk_data_service.dataset_indexing import get_index_name
+from utilities.azure import azure_download_blob, get_azure_blob_name, get_azure_container_name
+from utilities.db import get_datasets_in_bds
+
+
+def zipper(context: dict):
+
+ datasets = get_datasets_in_bds(context)
+
+ if context["single_run"]:
+ zipper_run(context, {}, datasets)
+ else:
+ zipper_service_loop(context, {}, datasets)
+
+
+def zipper_service_loop(context: dict, datasets_in_zip: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]):
+
+ while True:
+ zipper_run(context, datasets_in_zip, datasets_in_bds)
+
+ time.sleep(60 * 30)
+
+
+def zipper_run(context: dict, datasets_in_zip: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]):
+
+ run_start = datetime.datetime.now(datetime.UTC)
+ context["logger"].info("Zipper starting run")
+
+ clean_working_dir(context, datasets_in_zip)
+
+ datasets_with_downloads = {k: v for k, v in datasets_in_bds.items() if v["last_successful_download"] is not None}
+
+ remove_datasets_without_dls_from_working_dir(context, datasets_in_zip, datasets_with_downloads)
+
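+    # Only datasets whose content hash changed since the previous run (or that are
+    # new to the zip) need their XML re-downloaded into the working directory.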
+ new_or_updated_datasets = {
+ k: v
+ for k, v in datasets_with_downloads.items()
+ if k not in datasets_in_zip or datasets_in_zip[k]["hash"] != datasets_with_downloads[k]["hash"]
+ }
+
+ context["logger"].info(
+ "Found {} datasets to ZIP. {} are new or updated and will be (re-)downloaded.".format(
+ len(datasets_with_downloads), len(new_or_updated_datasets)
+ )
+ )
+
+ download_new_or_updated_to_working_dir(context, new_or_updated_datasets)
+
+ download_indices_to_working_dir(context)
+
+ context["logger"].info("Zipping {} datasets.".format(len(datasets_with_downloads)))
+ shutil.make_archive(
+ get_big_zip_local_pathname_no_extension(context),
+ "zip",
+ root_dir=context["ZIP_WORKING_DIR"],
+ base_dir="iati-data",
+ )
+
+ context["logger"].info("Uploading zipped datasets.")
+ upload_zip_to_azure(context)
+
+ run_end = datetime.datetime.now(datetime.UTC)
+ context["logger"].info(
+ "Zipper finished in {}. Datasets zipped: {}.".format(run_end - run_start, len(datasets_with_downloads))
+ )
+
+ datasets_in_zip.clear()
+ datasets_in_zip.update(datasets_with_downloads)
+
+
+def clean_working_dir(context: dict, datasets_in_zip: dict[uuid.UUID, dict]):
+ if len(datasets_in_zip) == 0:
+ context["logger"].info("First zip run of session, so deleting all XML " "files in the ZIP working dir.")
+ shutil.rmtree("{}/{}".format(context["ZIP_WORKING_DIR"], "iati-data"), ignore_errors=True)
+
+
+def remove_datasets_without_dls_from_working_dir(
+ context: dict, datasets_in_zip: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]
+):
+ datasets_removed = {k: v for k, v in datasets_in_zip.items() if v["id"] not in datasets_in_bds}
+
+ for dataset in datasets_removed.values():
+ delete_local_xml_from_zip_working_dir(context, dataset)
+
+
+def upload_zip_to_azure(context: dict):
+ az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ blob_client = az_blob_service.get_blob_client(
+ context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], get_big_zip_full_filename(context)
+ )
+
+    content_settings = ContentSettings(content_type="application/zip")
+
+ with open(get_big_zip_local_pathname(context), "rb") as data:
+ blob_client.upload_blob(data, overwrite=True, content_settings=content_settings)
+
+ az_blob_service.close()
+
+
+def download_indices_to_working_dir(context: dict):
+ az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ download_index_to_working_dir(context, az_blob_service, "minimal")
+
+ download_index_to_working_dir(context, az_blob_service, "full")
+
+ az_blob_service.close()
+
+
+def download_index_to_working_dir(context: dict, az_blob_service: BlobServiceClient, index_type: str):
+
+ index_filename = get_index_name(context, index_type)
+
+ index_full_pathname = "{}/iati-data/{}".format(context["ZIP_WORKING_DIR"], index_filename)
+
+ os.makedirs(os.path.dirname(index_full_pathname), exist_ok=True)
+
+ azure_download_blob(az_blob_service, get_azure_container_name(context, "xml"), index_filename, index_full_pathname)
+
+
+def download_new_or_updated_to_working_dir(context: dict, updated_datasets: dict[uuid.UUID, dict]):
+
+ az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ xml_container_name = get_azure_container_name(context, "xml")
+
+ for dataset in updated_datasets.values():
+ filename = get_local_pathname_dataset_xml(context, dataset)
+
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+ context["logger"].info("dataset id: {} - Downloading".format(dataset["id"]))
+
+ azure_download_blob(az_blob_service, xml_container_name, get_azure_blob_name(dataset, "xml"), filename)
+
+ az_blob_service.close()
+
+
+def get_big_zip_local_pathname_no_extension(context: dict) -> str:
+ return "{}/{}".format(context["ZIP_WORKING_DIR"], get_big_zip_base_filename(context))
+
+
+def get_big_zip_local_pathname(context: dict) -> str:
+ return "{}/{}".format(context["ZIP_WORKING_DIR"], get_big_zip_full_filename(context))
+
+
+def get_big_zip_base_filename(context: dict) -> str:
+ return "iati-data"
+
+
+def get_big_zip_full_filename(context: dict) -> str:
+ return "{}.zip".format(get_big_zip_base_filename(context))
+
+
+def get_local_pathname_dataset_xml(context: dict, dataset: dict) -> str:
+ return "{}/iati-data/datasets/{}".format(context["ZIP_WORKING_DIR"], get_azure_blob_name(dataset, "xml"))
+
+
+def delete_local_xml_from_zip_working_dir(context: dict, dataset: dict):
+ dataset_local_xml = get_local_pathname_dataset_xml(context, dataset)
+
+ if os.path.exists(dataset_local_xml):
+ try:
+ os.remove(dataset_local_xml)
+ except FileNotFoundError as e:
+ context["logger"].error(
+ "dataset id: {} - Error removing local XML file from "
+ "ZIP working dir. Details: {}.".format(dataset["id"], e)
+ )
diff --git a/src/config/__init__.py b/src/config/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/config/config.py b/src/config/config.py
new file mode 100644
index 0000000..bc0f0f2
--- /dev/null
+++ b/src/config/config.py
@@ -0,0 +1,30 @@
+import os
+
+_config_variables = [
+ "DATA_REGISTRATION",
+ "DATA_REGISTRY_BASE_URL",
+ "BLOB_STORAGE_BASE_PUBLIC_URL",
+ "NUMBER_DOWNLOADER_THREADS",
+ "FORCE_REDOWNLOAD_AFTER_HOURS",
+ "REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS",
+ "LOGFILE",
+ "ZIP_WORKING_DIR",
+ "DB_NAME",
+ "DB_USER",
+ "DB_PASS",
+ "DB_HOST",
+ "DB_PORT",
+ "DB_SSL_MODE",
+ "DB_CONNECTION_TIMEOUT",
+ "AZURE_STORAGE_CONNECTION_STRING",
+ "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML",
+ "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP",
+]
+
+
+def get_config() -> dict[str, str]:
+ config = {env_var: os.getenv(env_var, "") for env_var in _config_variables}
+
+ config["BLOB_STORAGE_BASE_PUBLIC_URL"] = config["BLOB_STORAGE_BASE_PUBLIC_URL"].strip("/")
+
+ return config
diff --git a/src/dataset_registration/__init__.py b/src/dataset_registration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/dataset_registration/factory.py b/src/dataset_registration/factory.py
new file mode 100644
index 0000000..4f2b317
--- /dev/null
+++ b/src/dataset_registration/factory.py
@@ -0,0 +1,10 @@
+import uuid
+from typing import Callable
+
+
+def get_func_to_fetch_list_registered_datasets(config: dict) -> Callable[..., dict[uuid.UUID, dict]]:
+ if config["DATA_REGISTRATION"] == "ckan-registry":
+ from dataset_registration.iati_registry_ckan import fetch_datasets_metadata
+ else:
+ from dataset_registration.iati_register_your_data import fetch_datasets_metadata
+ return fetch_datasets_metadata
diff --git a/src/dataset_registration/iati_register_your_data.py b/src/dataset_registration/iati_register_your_data.py
new file mode 100644
index 0000000..f60e213
--- /dev/null
+++ b/src/dataset_registration/iati_register_your_data.py
@@ -0,0 +1,7 @@
+import uuid
+
+import requests
+
+
+def fetch_datasets_metadata(context: dict, session: requests.Session) -> dict[uuid.UUID, dict]:
+ return {}
diff --git a/src/dataset_registration/iati_registry_ckan.py b/src/dataset_registration/iati_registry_ckan.py
new file mode 100644
index 0000000..ea5e103
--- /dev/null
+++ b/src/dataset_registration/iati_registry_ckan.py
@@ -0,0 +1,138 @@
+import random
+import uuid
+from logging import Logger
+from typing import Any
+
+import requests
+
+from utilities.http import http_get_json
+
+
+def fetch_datasets_metadata(context: dict, session: requests.Session) -> dict[uuid.UUID, dict]:
+
+ datasets_list_from_registry = fetch_datasets_metadata_from_iati_registry(context, session)
+
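+    # Randomise the processing order so datasets are not always fetched in the
+    # order the registry returns them.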
+ random.shuffle(datasets_list_from_registry)
+
+ cleaned_datasets_metadata = clean_datasets_metadata(context["logger"], datasets_list_from_registry)
+
+ datasets_metadata = convert_datasets_metadata(cleaned_datasets_metadata)
+
+ return datasets_metadata
+
+
+def fetch_datasets_metadata_from_iati_registry(context: dict, session: requests.Session) -> list[dict]:
+
+ api_url = context["DATA_REGISTRY_BASE_URL"]
+
+ number_of_datasets = http_get_json(session, api_url + "?rows=1")["result"]["count"]
+
+ if context["run_for_n_datasets"] is not None:
+ number_of_datasets = min(number_of_datasets, int(context["run_for_n_datasets"]))
+
+ batch_size = number_of_datasets if number_of_datasets < 1000 else 1000
+
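+    # Page through the registry's package_search results in batches of up to
+    # 1,000 rows until all dataset metadata has been fetched.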
+ datasets_metadata_downloaded = 0
+ datasets_metadata = []
+
+ while datasets_metadata_downloaded < number_of_datasets:
+ response = http_get_json(
+ session, "{}?rows={}&start={}".format(api_url, batch_size, str(datasets_metadata_downloaded))
+ )
+
+ datasets_metadata.extend(response["result"]["results"])
+
+ datasets_metadata_downloaded += len(response["result"]["results"])
+
+ return datasets_metadata
+
+
+def clean_datasets_metadata(logger: Logger, datasets_from_registry: list[dict[str, Any]]) -> list[dict[str, Any]]:
+ """Cleans the list of datasets from the IATI Registry
+
+    Datasets from the Registry should never be missing the fields checked in ckan_dataset_is_valid(),
+    so if one is, we log an error and skip that item. This allows for tidy conversion using a
+    dictionary comprehension."""
+
+ cleaned_datasets_metadata = []
+
+ for dataset_metadata in datasets_from_registry:
+ if not ckan_dataset_is_valid(logger, dataset_metadata):
+ continue
+
+ cleaned_datasets_metadata.append(dataset_metadata)
+
+ return cleaned_datasets_metadata
+
+
+def ckan_dataset_is_valid(logger: Logger, registry_dataset: dict[str, Any]) -> bool:
+ valid = True
+
+ for field in ["id", "name", "organization"]:
+ if not ckan_field_is_valid(registry_dataset[field]):
+ logger.error(
+ "dataset id: {} - Dataset with empty {} received from IATI Registry. "
+ "Skipping.".format(registry_dataset["id"], field)
+ )
+ valid = False
+
+ if ckan_field_is_valid(registry_dataset["organization"]) and not ckan_field_is_valid(
+ registry_dataset["organization"]["id"]
+ ): # noqa: E129
+ logger.error(
+ "dataset id: {} - Dataset with empty organisation.id received from "
+ "IATI Registry. Skipping.".format(registry_dataset["id"])
+ )
+ valid = False
+
+ if ckan_field_is_valid(registry_dataset["organization"]) and not ckan_field_is_valid(
+ registry_dataset["organization"]["name"]
+ ): # noqa: E129
+ logger.error(
+ "dataset id: {} - Dataset with empty organisation.name received from "
+ "IATI Registry. Skipping.".format(registry_dataset["id"])
+ )
+ valid = False
+
+ if (
+ not ckan_field_is_valid(registry_dataset["extras"])
+ or len([x for x in registry_dataset["extras"] if x["key"] == "filetype"]) == 0
+ ): # noqa: E129
+ logger.warning(
+ "dataset id: {} - Dataset with no filetype specified "
+ "received from IATI Registry. Skipping.".format(registry_dataset["id"])
+ )
+ valid = False
+
+ return valid
+
+
+def ckan_field_is_valid(value: Any) -> bool:
+ return value is not None and value != "None" and value != ""
+
+
+def convert_datasets_metadata(datasets_from_registry: list[dict]) -> dict[uuid.UUID, dict]:
+
+ registered_datasets = {
+ uuid.UUID(k["id"]): {
+ "id": uuid.UUID(k["id"]),
+ "name": k["name"],
+ "publisher_id": uuid.UUID(k["organization"]["id"]),
+ "publisher_name": k["organization"]["name"],
+ "source_url": get_source_url(k),
+ "type": list(filter(lambda x: x["key"] == "filetype", k["extras"]))[0]["value"],
+ }
+ for k in datasets_from_registry
+ }
+
+ return registered_datasets
+
+
+def get_source_url(ckan_dataset: dict) -> str:
+ if (
+ "resources" in ckan_dataset
+ and isinstance(ckan_dataset["resources"], list)
+ and len(ckan_dataset["resources"]) > 0
+ and "url" in ckan_dataset["resources"][0]
+ ): # noqa: E129
+ return ckan_dataset["resources"][0].get("url", "")
+ return ""
diff --git a/src/dataset_registration/registration_services.py b/src/dataset_registration/registration_services.py
new file mode 100644
index 0000000..15ec248
--- /dev/null
+++ b/src/dataset_registration/registration_services.py
@@ -0,0 +1,15 @@
+import uuid
+
+import requests
+
+from dataset_registration.factory import get_func_to_fetch_list_registered_datasets
+
+
+def get_registered_datasets(context: dict) -> dict[uuid.UUID, dict]:
+ session = requests.Session()
+
+ fetch_datasets_metadata = get_func_to_fetch_list_registered_datasets(context)
+
+ datasets_metadata = fetch_datasets_metadata(context, session)
+
+ return datasets_metadata
diff --git a/src/iati_bulk_data_service.py b/src/iati_bulk_data_service.py
new file mode 100644
index 0000000..fc9d51e
--- /dev/null
+++ b/src/iati_bulk_data_service.py
@@ -0,0 +1,47 @@
+import argparse
+
+from bulk_data_service.checker import checker
+from bulk_data_service.zipper import zipper
+from config.config import get_config
+from utilities.azure import create_azure_blob_containers
+from utilities.db import apply_db_migrations
+from utilities.logging import initialise_logging
+
+
+def main(args: argparse.Namespace):
+
+ config = get_config()
+
+ logger = initialise_logging(config)
+
+ context = config | {"logger": logger, "single_run": args.single_run, "run_for_n_datasets": args.run_for_n_datasets}
+
+ apply_db_migrations(context)
+
+ create_azure_blob_containers(context)
+
+ if args.operation == "checker":
+ checker(context)
+ elif args.operation == "zipper":
+ zipper(context)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="IATI Bulk Data Service")
+ parser.add_argument(
+ "--operation",
+ choices=["checker", "zipper"],
+ required=True,
+ help="Operation to run: checker, downloader",
+ )
+ parser.add_argument(
+ "--single-run",
+ action="store_true",
+ help="Perform a single run, then exit",
+ )
+ parser.add_argument(
+ "--run-for-n-datasets",
+ type=int,
+ help="Run on the first N datasets from registration service (useful for testing)",
+ )
+ main(parser.parse_args())
diff --git a/src/utilities/__init__.py b/src/utilities/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/utilities/azure.py b/src/utilities/azure.py
new file mode 100644
index 0000000..2d111db
--- /dev/null
+++ b/src/utilities/azure.py
@@ -0,0 +1,100 @@
+from typing import Any
+
+import azure.core.exceptions
+from azure.storage.blob import BlobServiceClient, ContentSettings
+
+
+def azure_download_blob(az_blob_service: BlobServiceClient, container_name: str, blob_name: str, filename: str):
+
+ blob_client = az_blob_service.get_blob_client(container_name, blob_name)
+
+ with open(file=filename, mode="wb") as xml_output:
+ download_stream = blob_client.download_blob()
+ xml_output.write(download_stream.readall())
+
+ blob_client.close()
+
+
+def azure_upload_to_blob(
+ az_blob_service: BlobServiceClient, container_name: str, blob_name: str, content: Any, content_type: str
+):
+
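+    # Overwrites any existing blob with the same name; XML uploads additionally
+    # declare a UTF-8 content encoding in the blob's content settings.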
+ blob_client = az_blob_service.get_blob_client(container_name, blob_name)
+
+ content_settings = ContentSettings(content_type=content_type)
+
+ if content_type == "application/xml":
+ content_settings.content_encoding = "UTF-8"
+
+ blob_client.upload_blob(content, overwrite=True, content_settings=content_settings)
+
+
+def create_azure_blob_containers(context: dict):
+ blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ containers = blob_service.list_containers()
+ container_names = [c.name for c in containers]
+
+ try:
+ if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"] not in container_names:
+ blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"])
+ if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] not in container_names:
+ blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"])
+ except Exception as e:
+ context["logger"].info("Could not create Azure blob storage containers: {}".format(e))
+ raise e
+ finally:
+ blob_service.close()
+
+
+def delete_azure_blob_containers(context: dict):
+ blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+ containers = blob_service.list_containers()
+ container_names = [c.name for c in containers]
+
+ try:
+ if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"] in container_names:
+ blob_service.delete_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"])
+ if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] in container_names:
+ blob_service.delete_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"])
+ except Exception as e:
+ context["logger"].info("Could not delete Azure blob storage container: {}".format(e))
+ raise e
+ finally:
+ blob_service.close()
+
+
+def delete_azure_iati_blob(context: dict, blob_service_client: BlobServiceClient, dataset: dict, iati_blob_type: str):
+
+ container_name = get_azure_container_name(context, iati_blob_type)
+
+ blob_name = get_azure_blob_name(dataset, iati_blob_type)
+
+ try:
+ blob_client = blob_service_client.get_blob_client(container_name, blob_name)
+
+ blob_client.delete_blob()
+ except azure.core.exceptions.ResourceNotFoundError as e:
+ context["logger"].warning(
+ "dataset id: {} - Problem deleting blob that was "
+ "expected to exist: {}".format(dataset["id"], e).replace("\n", "")
+ )
+ finally:
+ blob_client.close()
+
+
+def get_azure_container_name(context: dict, iati_blob_type: str) -> str:
+ return context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_" + iati_blob_type.upper()]
+
+
+def get_azure_blob_name(dataset: dict, iati_blob_type: str) -> str:
+ return "{}/{}.{}".format(dataset["publisher_name"], dataset["name"], iati_blob_type)
+
+
+def get_azure_blob_public_url(context: dict, dataset: dict, iati_blob_type: str) -> str:
+ return "{}/{}/{}".format(
+ context["BLOB_STORAGE_BASE_PUBLIC_URL"],
+ context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_" + iati_blob_type.upper()],
+ get_azure_blob_name(dataset, iati_blob_type),
+ )
diff --git a/src/utilities/db.py b/src/utilities/db.py
new file mode 100644
index 0000000..6e2d55a
--- /dev/null
+++ b/src/utilities/db.py
@@ -0,0 +1,109 @@
+import uuid
+from typing import Any
+
+import psycopg
+from yoyo import get_backend, read_migrations # type: ignore
+
+
+def apply_db_migrations(context: dict):
+
+ backend = get_backend(
+ "postgresql+psycopg://{}:{}@{}:{}/{}".format(
+ context["DB_USER"], context["DB_PASS"], context["DB_HOST"], context["DB_PORT"], context["DB_NAME"]
+ )
+ )
+
+ migrations = read_migrations("db-migrations")
+
+ with backend.lock():
+
+ # Apply any outstanding migrations
+ backend.apply_migrations(backend.to_apply(migrations))
+
+
+def get_db_connection(context: dict) -> psycopg.Connection:
+ connection = psycopg.connect(
+ dbname=context["DB_NAME"],
+ user=context["DB_USER"],
+ password=context["DB_PASS"],
+ host=context["DB_HOST"],
+ port=context["DB_PORT"],
+ sslmode="prefer" if context["DB_SSL_MODE"] is None else context["DB_SSL_MODE"],
+ connect_timeout=context["DB_CONNECTION_TIMEOUT"],
+ )
+ return connection
+
+
+def get_datasets_in_bds(context: dict) -> dict[uuid.UUID, dict]:
+
+ connection = get_db_connection(context)
+ cursor = connection.cursor(row_factory=psycopg.rows.dict_row)
+ cursor.execute("""SELECT * FROM iati_datasets""")
+ results_as_list = cursor.fetchall()
+ cursor.close()
+
+ results = {result["id"]: result for result in results_as_list}
+
+ return results
+
+
+def insert_or_update_dataset(connection: psycopg.Connection, data):
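+    # 'data' is a dict keyed by column name; the keys drive both the column list
+    # and the named psycopg placeholders used in the upsert below.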
+ columns = ", ".join([k for k in data])
+ placeholders = ", ".join(["%({})s".format(k) for k in data])
+
+ add_sql = """INSERT INTO iati_datasets ({})
+ VALUES ({})
+ ON CONFLICT (id) DO
+ UPDATE SET
+ publisher_id = %(publisher_id)s,
+ publisher_name = %(publisher_name)s,
+ type = %(type)s,
+ source_url = %(source_url)s,
+ hash = %(hash)s,
+ hash_excluding_generated_timestamp = %(hash_excluding_generated_timestamp)s,
+ last_update_check = %(last_update_check)s,
+ last_head_attempt = %(last_head_attempt)s,
+ last_head_http_status = %(last_head_http_status)s,
+ head_error_message = %(head_error_message)s,
+ last_download_attempt = %(last_download_attempt)s,
+ last_download_http_status = %(last_download_http_status)s,
+ last_successful_download = %(last_successful_download)s,
+ last_verified_on_server = %(last_verified_on_server)s,
+ download_error_message = %(download_error_message)s,
+ content_modified = %(content_modified)s,
+ content_modified_excluding_generated_timestamp =
+ %(content_modified_excluding_generated_timestamp)s,
+ server_header_last_modified = %(server_header_last_modified)s,
+ server_header_etag = %(server_header_etag)s
+ WHERE
+ iati_datasets.id = %(id)s
+ """.format(
+ columns, placeholders
+ )
+ cursor = connection.cursor()
+ cursor.execute(add_sql, data) # type: ignore
+ cursor.close()
+ connection.commit()
+
+
+def remove_dataset_from_db(connection: psycopg.Connection, dataset_id):
+ add_sql = """DELETE FROM iati_datasets WHERE id = %(dataset_id)s"""
+ cursor = connection.cursor()
+ cursor.execute(add_sql, {"dataset_id": dataset_id})
+ cursor.close()
+ connection.commit()
+
+
+def execute_scalar_db_query(context: dict, sql: str) -> Any:
+ connection = get_db_connection(context)
+ value = execute_scalar_db_query_with_conn(connection, sql)
+ connection.close()
+ return value
+
+
+def execute_scalar_db_query_with_conn(connection: psycopg.Connection, sql: str) -> Any:
+ cursor = connection.cursor()
+ row = cursor.execute(sql).fetchone()
+ value = row[0] if row is not None else -1
+ cursor.close()
+ return value
diff --git a/src/utilities/http.py b/src/utilities/http.py
new file mode 100644
index 0000000..0f4b2e2
--- /dev/null
+++ b/src/utilities/http.py
@@ -0,0 +1,66 @@
+from typing import Any
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+
+
+def get_requests_session() -> requests.Session:
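+    # Shared session with an identifying User-Agent and automatic retries
+    # (2 retries with a small backoff) for both HTTP and HTTPS requests.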
+ session = requests.Session()
+ session.headers.update({"User-Agent": "IATI Bulk Data Service 0.1"})
+ retries = Retry(total=2, backoff_factor=0.1)
+ session.mount("http://", HTTPAdapter(max_retries=retries))
+ session.mount("https://", HTTPAdapter(max_retries=retries))
+ return session
+
+
+def http_get_json(session: requests.Session, url: str, timeout: int = 30, exception_on_non_200: bool = True) -> Any:
+
+ response = session.get(url=url, timeout=timeout)
+
+ if exception_on_non_200 and response.status_code != 200:
+ raise RuntimeError(
+ "HTTP status code {} and reason {} when fetching {}".format(response.status_code, response.reason, url)
+ )
+
+ return response.json()
+
+
+def http_head_dataset(session: requests.Session, url: str, timeout: int = 10, retries: int = 2) -> requests.Response:
+
+ response = session.head(url=url, timeout=timeout, allow_redirects=True)
+
+ if response.status_code != 200:
+ raise RuntimeError(
+ {
+ "message": "HEAD request failed with non-200 status",
+ "url": response.url,
+ "http_method": "HEAD",
+ "http_status_code": response.status_code,
+ "http_reason": response.reason,
+ "http_headers": response.headers,
+ }
+ )
+
+ return response
+
+
+def http_download_dataset(
+ session: requests.Session, url: str, timeout: int = 25, retries: int = 2
+) -> requests.Response:
+
+ response = session.get(url=url, timeout=timeout, allow_redirects=True)
+
+ if response.status_code != 200:
+ raise RuntimeError(
+ {
+ "message": "HTTP GET request failed with non-200 status",
+ "url": response.url,
+ "http_method": "GET",
+ "http_status_code": response.status_code,
+ "http_reason": response.reason,
+ "http_headers": response.headers,
+ }
+ )
+
+ return response
diff --git a/src/utilities/logging.py b/src/utilities/logging.py
new file mode 100644
index 0000000..095af4f
--- /dev/null
+++ b/src/utilities/logging.py
@@ -0,0 +1,28 @@
+import logging
+import sys
+import time
+
+
+def initialise_logging(config: dict) -> logging.Logger:
+
+ bds_logger = logging.getLogger("bds")
+
+ bds_logger.setLevel(logging.DEBUG)
+
+ formatter = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ formatter.default_time_format = "%Y-%m-%dT%H:%M:%S"
+ formatter.default_msec_format = "%s,%03dZ"
+
+ formatter.converter = time.gmtime
+
+ handler_stdout = logging.StreamHandler(sys.stdout)
+ handler_stdout.setFormatter(formatter)
+
+ bds_logger.addHandler(handler_stdout)
+
+ if config["LOGFILE"] != "":
+ handler_file = logging.FileHandler(config["LOGFILE"])
+ handler_file.setFormatter(formatter)
+ bds_logger.addHandler(handler_file)
+
+ return bds_logger
diff --git a/src/utilities/misc.py b/src/utilities/misc.py
new file mode 100644
index 0000000..b9a7642
--- /dev/null
+++ b/src/utilities/misc.py
@@ -0,0 +1,49 @@
+import datetime
+import hashlib
+import io
+import re
+import zipfile
+
+
+def get_hash(content: str) -> str:
+ hasher = hashlib.sha1()
+ hasher.update(content.encode("utf-8"))
+ return hasher.hexdigest()
+
+
+def get_hash_excluding_generated_timestamp(content: str) -> str:
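+    # Strip the generated-datetime attribute before hashing, so a file whose only
+    # change is its generation timestamp hashes to the same value as before.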
+ content_to_hash = re.sub(r'generated-datetime="[^"]+"', "", content)
+
+ hasher = hashlib.sha1()
+ hasher.update(content_to_hash.encode("utf-8"))
+ return hasher.hexdigest()
+
+
+def get_timestamp(isodate: str = "") -> datetime.datetime:
+ if isodate != "":
+ return datetime.datetime.fromisoformat(isodate).astimezone()
+ else:
+ return datetime.datetime.now(tz=datetime.timezone.utc)
+
+
+def get_timestamp_as_str(isodate: str = "") -> str:
+ if isodate != "":
+ return datetime.datetime.fromisoformat(isodate).astimezone().isoformat()
+ else:
+ return datetime.datetime.now(tz=datetime.timezone.utc).isoformat()
+
+
+def set_timestamp_tz_utc(date: datetime.datetime) -> datetime.datetime:
+ return date.replace(tzinfo=datetime.timezone.utc)
+
+
+def zip_data_as_single_file(filename: str, data: str) -> bytes:
+
+ zip_buffer = io.BytesIO()
+
+ with zipfile.ZipFile(zip_buffer, "w") as xml_zipped:
+ xml_zipped.writestr(filename, data)
+
+ return zip_buffer.getvalue()
diff --git a/src/utilities/prometheus.py b/src/utilities/prometheus.py
new file mode 100644
index 0000000..6577e39
--- /dev/null
+++ b/src/utilities/prometheus.py
@@ -0,0 +1,74 @@
+from prometheus_client import Gauge, start_http_server
+
+from utilities.db import execute_scalar_db_query_with_conn, get_db_connection
+
+
+def get_metrics_definitions(context: dict) -> list:
+ metrics_defs = [
+ ("total_number_of_datasets", "The total number of datasets"),
+ ("datasets_with_download", "The number of datasets with a last good download"),
+ ("datasets_added", "The number of datasets removed during last update"),
+ ("datasets_unregistered", "The number of datasets unregistered and so removed during last run"),
+ ("datasets_expired", "The number of datasets that expired in last run"),
+ (
+ "datasets_head_request_non_200",
+ "The number of HEAD requests that returned non-200 status in the last run",
+ ),
+ (
+ "datasets_downloads_non_200",
+ "The number of download attempts that returned non-200 status in the last run",
+ ),
+ (
+ "checker_run_duration",
+ "The time taken by the last run of the checker (seconds)",
+ ),
+ (
+ "number_crashes",
+ "The number of crashes since app restart",
+ ),
+ ]
+
+ return metrics_defs
+
+
+def initialise_prometheus_client(context: dict) -> dict:
+
+ metrics = {}
+
+ for metric in get_metrics_definitions(context):
+ metrics[metric[0]] = Gauge(metric[0], metric[1])
+
+ metrics["number_crashes"].set(0)
+
+ context["prom_metrics"] = metrics
+
+ start_http_server(9090)
+
+ return context
+
+
+def update_metrics_from_db(context: dict) -> dict:
+ metrics_and_their_sql = [
+ (
+ "datasets_with_download",
+ ("SELECT COUNT(id) FROM iati_datasets WHERE " "last_successful_download IS NOT NULL"),
+ ),
+ (
+ "datasets_head_request_non_200",
+ ("SELECT COUNT(id) FROM iati_datasets WHERE " "last_head_http_status != 200"),
+ ),
+ (
+ "datasets_downloads_non_200",
+ ("SELECT COUNT(id) FROM iati_datasets WHERE " "last_download_http_status != 200"),
+ ),
+ ]
+
+ db_conn = get_db_connection(context)
+
+ for metric_from_db in metrics_and_their_sql:
+ metric = execute_scalar_db_query_with_conn(db_conn, metric_from_db[1])
+ context["prom_metrics"][metric_from_db[0]].set(metric)
+
+ db_conn.close()
+
+ return context
diff --git a/tests-local-environment/.env b/tests-local-environment/.env
new file mode 100644
index 0000000..11a82eb
--- /dev/null
+++ b/tests-local-environment/.env
@@ -0,0 +1,31 @@
+DATA_REGISTRATION=ckan-registry
+DATA_REGISTRY_BASE_URL=http://localhost:3000/registration/datasets-03
+
+BLOB_STORAGE_BASE_PUBLIC_URL=http://127.0.0.1:10000/devstoreaccount1
+
+NUMBER_DOWNLOADER_THREADS=25
+
+FORCE_REDOWNLOAD_AFTER_HOURS=24
+
+REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72
+
+# Log file
+LOGFILE=
+
+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
+
+DB_NAME=bulk_data_service_db
+DB_USER=bds
+
+# Setup for test environment DB
+DB_PASS=pass
+DB_HOST=localhost
+DB_PORT=5275
+DB_SSL_MODE=disable
+DB_CONNECTION_TIMEOUT=30
+
+# Azurite Emulator (run from docker compose)
+AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:11000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:11001/devstoreaccount1;TableEndpoint=http://127.0.0.1:11002/devstoreaccount1;
+
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip
diff --git a/tests-local-environment/docker-compose.yml b/tests-local-environment/docker-compose.yml
new file mode 100644
index 0000000..408ead3
--- /dev/null
+++ b/tests-local-environment/docker-compose.yml
@@ -0,0 +1,64 @@
+
+services:
+
+ iati-bulk-data-service-azurite-tests:
+ image: mcr.microsoft.com/azure-storage/azurite
+ ports:
+ - 11000:11000
+ - 11001:11001
+ - 11002:11002
+ command: azurite --blobHost 0.0.0.0 --blobPort 11000 --queueHost 0.0.0.0 --queuePort 11001 --tableHost 0.0.0.0 --tablePort 11002 --location /data --loose --skipApiVersionCheck
+ tmpfs:
+ - /data
+ healthcheck:
+ test: nc 127.0.0.1 11000 -z
+ interval: 5s
+ retries: 1
+
+ iati-bulk-data-service-postgres-tests:
+ image: postgres:15
+ ports:
+ - "${DB_PORT}:${DB_PORT}"
+ environment:
+ - POSTGRES_DB=${DB_NAME}
+ - POSTGRES_USER=${DB_USER}
+ - POSTGRES_PASSWORD=${DB_PASS}
+ - PGPORT=${DB_PORT}
+ - PGDATA=/var/lib/postgresql/data/pgdata
+ tmpfs:
+ - /var/lib/postgresql/data
+ healthcheck:
+ test: ["CMD-SHELL", "pg_isready -U bds -d bulk_data_service_db"]
+ interval: 5s
+ timeout: 5s
+ retries: 5
+
+ mock-registration-and-data-service:
+ image: mockoon/cli:latest
+ ports:
+ - 3000:3000
+ volumes:
+ - type: bind
+ source: mockoon-registration-and-data-server-config.json
+ target: /server-config.json
+ - type: bind
+ source: ../tests/artifacts
+ target: /tests/artifacts
+ read_only: true
+ command: ["--data", "server-config.json", "-p", "3000"]
+ healthcheck:
+ test: curl -s -o /dev/null 'http://localhost:3000/registration/datasets-01'
+ interval: 10s
+ timeout: 10s
+ retries: 1
+
+ all-services-running:
+ image: alpine
+ entrypoint: ["tail", "-f", "/dev/null"]
+ depends_on:
+ iati-bulk-data-service-azurite-tests:
+ condition: service_healthy
+ iati-bulk-data-service-postgres-tests:
+ condition: service_healthy
+ mock-registration-and-data-service:
+ condition: service_healthy
diff --git a/tests-local-environment/mockoon-registration-and-data-server-config.json b/tests-local-environment/mockoon-registration-and-data-server-config.json
new file mode 100644
index 0000000..f3f2c39
--- /dev/null
+++ b/tests-local-environment/mockoon-registration-and-data-server-config.json
@@ -0,0 +1,494 @@
+{
+ "uuid": "f2cf8a6c-fb05-4fd1-b810-b7e675f4fa9b",
+ "lastMigration": 32,
+ "name": "Mock registration and data servers",
+ "endpointPrefix": "",
+ "latency": 0,
+ "port": 3001,
+ "hostname": "",
+ "folders": [],
+ "routes": [
+ {
+ "uuid": "4313b3fa-170d-41bc-bc73-1331b6fe54e9",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "registration/datasets-01",
+ "responses": [
+ {
+ "uuid": "41f6e285-5568-4bc3-ac99-04fe53e03491",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/ckan-registry-datasets-01-1-dataset.json",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "fe5038e8-f3a5-43d2-ab12-3fc2c72b2c11",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "data/test_foundation_a-dataset-001.xml",
+ "responses": [
+ {
+ "uuid": "cf576bd8-d758-49dc-a8c2-c3399bf816d7",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/iati-xml-files/test_foundation_a-dataset-001.xml",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [
+ {
+ "target": "body",
+ "modifier": "",
+ "value": "",
+ "invert": false,
+ "operator": "equals"
+ }
+ ],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "af641cc0-d99f-4dd2-b4f6-d1f6335bc774",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "registration/datasets-02",
+ "responses": [
+ {
+ "uuid": "68bdad9f-7236-47d9-bb4c-f49ee2ca97f8",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/ckan-registry-datasets-02-2-datasets.json",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "7beddae4-23a1-43a6-9507-98627f39e170",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "data/test_foundation_b-dataset-001.xml",
+ "responses": [
+ {
+ "uuid": "81e7de85-adae-45c8-a338-5294d2701aef",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/iati-xml-files/test_foundation_b-dataset-001.xml",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [
+ {
+ "target": "body",
+ "modifier": "",
+ "value": "",
+ "invert": false,
+ "operator": "equals"
+ }
+ ],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "698df016-9108-429d-b48a-da8b4e687b20",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "registration/datasets-03",
+ "responses": [
+ {
+ "uuid": "6b34ed1d-c059-4ac2-be98-2f09629433f4",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/ckan-registry-datasets-03-1-dataset-404.json",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "304b9dd4-ae2d-4bc2-be2f-42bf2bb89b21",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "registration/datasets-04",
+ "responses": [
+ {
+ "uuid": "4ba14f3d-3161-4ef6-a745-30e0e90187d9",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/ckan-registry-datasets-04-2-datasets-1-404.json",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "42601386-dc87-4c1b-adee-7a4f16f6e31b",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "error-response/:code",
+ "responses": [
+ {
+ "uuid": "d40012e1-508d-4c29-8105-3a4bfa7fb9b1",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "INLINE",
+ "filePath": "",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [
+ {
+ "target": "params",
+ "modifier": "code",
+ "value": "200",
+ "invert": false,
+ "operator": "equals"
+ }
+ ],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ },
+ {
+ "uuid": "a726e010-c22d-45dd-86bb-eed1419a6876",
+ "body": "",
+ "latency": 0,
+ "statusCode": 400,
+ "label": "",
+ "headers": [],
+ "bodyType": "INLINE",
+ "filePath": "",
+ "databucketID": "",
+ "sendFileAsBody": false,
+ "rules": [
+ {
+ "target": "params",
+ "modifier": "code",
+ "value": "400",
+ "invert": false,
+ "operator": "equals"
+ }
+ ],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": false,
+ "crudKey": "id",
+ "callbacks": []
+ },
+ {
+ "uuid": "0c2546f6-3b03-4b94-9bef-54afb4a97f5c",
+ "body": "This page was not found.",
+ "latency": 0,
+ "statusCode": 404,
+ "label": "",
+ "headers": [],
+ "bodyType": "INLINE",
+ "filePath": "",
+ "databucketID": "",
+ "sendFileAsBody": false,
+ "rules": [
+ {
+ "target": "params",
+ "modifier": "code",
+ "value": "404",
+ "invert": false,
+ "operator": "equals"
+ }
+ ],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": false,
+ "crudKey": "id",
+ "callbacks": []
+ },
+ {
+ "uuid": "75af5d78-4178-405c-b605-cbcd9a9318dc",
+ "body": "Internal server error.",
+ "latency": 0,
+ "statusCode": 500,
+ "label": "",
+ "headers": [],
+ "bodyType": "INLINE",
+ "filePath": "",
+ "databucketID": "",
+ "sendFileAsBody": false,
+ "rules": [
+ {
+ "target": "params",
+ "modifier": "code",
+ "value": "500",
+ "invert": false,
+ "operator": "equals"
+ }
+ ],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": false,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "b81b26b1-62f6-473f-9815-bbb219c7667b",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "registration/datasets-05",
+ "responses": [
+ {
+ "uuid": "940422a8-6e56-462c-8601-e62316b419a4",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/ckan-registry-datasets-05-1-dataset-updated.json",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "5f8025cd-7f9a-49cf-9660-b657f9b822ec",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "data/test_foundation_a-dataset-001-403.xml",
+ "responses": [
+ {
+ "uuid": "008d654b-9b35-46b5-b12a-a81dac62b4d9",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 403,
+ "label": "",
+ "headers": [],
+ "bodyType": "INLINE",
+ "filePath": "",
+ "databucketID": "",
+ "sendFileAsBody": false,
+ "rules": [],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ },
+ {
+ "uuid": "de92700b-4c9f-49c7-94e9-fa232a6b770a",
+ "type": "http",
+ "documentation": "",
+ "method": "get",
+ "endpoint": "registration/datasets-01-dataset-403",
+ "responses": [
+ {
+ "uuid": "f9d7daec-2b82-43d4-b618-ecb7d6578213",
+ "body": "{}",
+ "latency": 0,
+ "statusCode": 200,
+ "label": "",
+ "headers": [],
+ "bodyType": "FILE",
+ "filePath": "tests/artifacts/ckan-registry-datasets-01-1-dataset-403.json",
+ "databucketID": "",
+ "sendFileAsBody": true,
+ "rules": [],
+ "rulesOperator": "OR",
+ "disableTemplating": false,
+ "fallbackTo404": false,
+ "default": true,
+ "crudKey": "id",
+ "callbacks": []
+ }
+ ],
+ "responseMode": null
+ }
+ ],
+ "rootChildren": [
+ {
+ "type": "route",
+ "uuid": "4313b3fa-170d-41bc-bc73-1331b6fe54e9"
+ },
+ {
+ "type": "route",
+ "uuid": "de92700b-4c9f-49c7-94e9-fa232a6b770a"
+ },
+ {
+ "type": "route",
+ "uuid": "af641cc0-d99f-4dd2-b4f6-d1f6335bc774"
+ },
+ {
+ "type": "route",
+ "uuid": "698df016-9108-429d-b48a-da8b4e687b20"
+ },
+ {
+ "type": "route",
+ "uuid": "304b9dd4-ae2d-4bc2-be2f-42bf2bb89b21"
+ },
+ {
+ "type": "route",
+ "uuid": "b81b26b1-62f6-473f-9815-bbb219c7667b"
+ },
+ {
+ "type": "route",
+ "uuid": "fe5038e8-f3a5-43d2-ab12-3fc2c72b2c11"
+ },
+ {
+ "type": "route",
+ "uuid": "5f8025cd-7f9a-49cf-9660-b657f9b822ec"
+ },
+ {
+ "type": "route",
+ "uuid": "7beddae4-23a1-43a6-9507-98627f39e170"
+ },
+ {
+ "type": "route",
+ "uuid": "42601386-dc87-4c1b-adee-7a4f16f6e31b"
+ }
+ ],
+ "proxyMode": false,
+ "proxyHost": "",
+ "proxyRemovePrefix": false,
+ "tlsOptions": {
+ "enabled": false,
+ "type": "CERT",
+ "pfxPath": "",
+ "certPath": "",
+ "keyPath": "",
+ "caPath": "",
+ "passphrase": ""
+ },
+ "cors": true,
+ "headers": [
+ {
+ "key": "Content-Type",
+ "value": "application/json"
+ },
+ {
+ "key": "Access-Control-Allow-Origin",
+ "value": "*"
+ },
+ {
+ "key": "Access-Control-Allow-Methods",
+ "value": "GET,POST,PUT,PATCH,DELETE,HEAD,OPTIONS"
+ },
+ {
+ "key": "Access-Control-Allow-Headers",
+ "value": "Content-Type, Origin, Accept, Authorization, Content-Length, X-Requested-With"
+ }
+ ],
+ "proxyReqHeaders": [
+ {
+ "key": "",
+ "value": ""
+ }
+ ],
+ "proxyResHeaders": [
+ {
+ "key": "",
+ "value": ""
+ }
+ ],
+ "data": [],
+ "callbacks": []
+}
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/artifacts/ckan-registry-datasets-01-1-dataset-403.json b/tests/artifacts/ckan-registry-datasets-01-1-dataset-403.json
new file mode 100644
index 0000000..a9b0d4a
--- /dev/null
+++ b/tests/artifacts/ckan-registry-datasets-01-1-dataset-403.json
@@ -0,0 +1,109 @@
+{
+ "help": "https://iatiregistry.org/api/3/action/help_show?name=package_search",
+ "success": true,
+ "result": {
+ "count": 1,
+ "facets": {},
+ "results": [
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb",
+ "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_a-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "ea055d99-f7e9-456f-9f99-963e95493c1b",
+ "name": "test_foundation_a",
+ "title": "Test Foundation A",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "5d04f169-c702-45fe-8162-da7834859d86",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "activity"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/data/test_foundation_a-dataset-001-403.xml",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ }
+ ],
+ "sort": "title_string asc",
+ "search_facets": {}
+ }
+}
\ No newline at end of file
diff --git a/tests/artifacts/ckan-registry-datasets-01-1-dataset.json b/tests/artifacts/ckan-registry-datasets-01-1-dataset.json
new file mode 100644
index 0000000..03fe39f
--- /dev/null
+++ b/tests/artifacts/ckan-registry-datasets-01-1-dataset.json
@@ -0,0 +1,109 @@
+{
+ "help": "https://iatiregistry.org/api/3/action/help_show?name=package_search",
+ "success": true,
+ "result": {
+ "count": 1,
+ "facets": {},
+ "results": [
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb",
+ "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_a-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "ea055d99-f7e9-456f-9f99-963e95493c1b",
+ "name": "test_foundation_a",
+ "title": "Test Foundation A",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "5d04f169-c702-45fe-8162-da7834859d86",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "activity"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/data/test_foundation_a-dataset-001.xml",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ }
+ ],
+ "sort": "title_string asc",
+ "search_facets": {}
+ }
+}
\ No newline at end of file
diff --git a/tests/artifacts/ckan-registry-datasets-02-2-datasets.json b/tests/artifacts/ckan-registry-datasets-02-2-datasets.json
new file mode 100644
index 0000000..dde09ba
--- /dev/null
+++ b/tests/artifacts/ckan-registry-datasets-02-2-datasets.json
@@ -0,0 +1,207 @@
+{
+ "help": "https://iatiregistry.org/api/3/action/help_show?name=package_search",
+ "success": true,
+ "result": {
+ "count": 2,
+ "facets": {},
+ "results": [
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb",
+ "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_a-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "ea055d99-f7e9-456f-9f99-963e95493c1b",
+ "name": "test_foundation_a",
+ "title": "Test Foundation A",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "5d04f169-c702-45fe-8162-da7834859d86",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "activity"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/data/test_foundation_a-dataset-001.xml",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ },
+
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "444659cc-998e-4e31-922b-0c6b04240c4c",
+ "id": "90f4282f-9ac5-4385-804d-1a377f5b57be",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_b-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "373cd7dc-114b-4a91-a63f-cb2e25cd1249",
+ "name": "test_foundation_b",
+ "title": "Test Foundation B",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "c22c9225-8410-4dae-ab71-127faccc0afb",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "activity"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "570c9e2d-6a9f-429f-8347-05758a195c97",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "4d8c94ed-af9a-4f8d-be00-2e692717b227",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/data/test_foundation_b-dataset-001.xml",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ }
+ ],
+ "sort": "title_string asc",
+ "search_facets": {}
+ }
+}
\ No newline at end of file
diff --git a/tests/artifacts/ckan-registry-datasets-03-1-dataset-404.json b/tests/artifacts/ckan-registry-datasets-03-1-dataset-404.json
new file mode 100644
index 0000000..6aa3aaa
--- /dev/null
+++ b/tests/artifacts/ckan-registry-datasets-03-1-dataset-404.json
@@ -0,0 +1,109 @@
+{
+ "help": "https://iatiregistry.org/api/3/action/help_show?name=package_search",
+ "success": true,
+ "result": {
+ "count": 1,
+ "facets": {},
+ "results": [
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb",
+ "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_a-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "ea055d99-f7e9-456f-9f99-963e95493c1b",
+ "name": "test_foundation_a",
+ "title": "Test Foundation A",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "5d04f169-c702-45fe-8162-da7834859d86",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "activity"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/data/test_foundation_a-dataset-404.xml",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ }
+ ],
+ "sort": "title_string asc",
+ "search_facets": {}
+ }
+}
\ No newline at end of file
diff --git a/tests/artifacts/ckan-registry-datasets-04-2-datasets-1-404.json b/tests/artifacts/ckan-registry-datasets-04-2-datasets-1-404.json
new file mode 100644
index 0000000..54ff039
--- /dev/null
+++ b/tests/artifacts/ckan-registry-datasets-04-2-datasets-1-404.json
@@ -0,0 +1,207 @@
+{
+ "help": "https://iatiregistry.org/api/3/action/help_show?name=package_search",
+ "success": true,
+ "result": {
+ "count": 2,
+ "facets": {},
+ "results": [
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb",
+ "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_a-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "ea055d99-f7e9-456f-9f99-963e95493c1b",
+ "name": "test_foundation_a",
+ "title": "Test Foundation A",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "5d04f169-c702-45fe-8162-da7834859d86",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "activity"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/data/test_foundation_a-dataset-001.xml",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ },
+
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "444659cc-998e-4e31-922b-0c6b04240c4c",
+ "id": "90f4282f-9ac5-4385-804d-1a377f5b57be",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_b-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "373cd7dc-114b-4a91-a63f-cb2e25cd1249",
+ "name": "test_foundation_b",
+ "title": "Test Foundation B",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "c22c9225-8410-4dae-ab71-127faccc0afb",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "activity"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "570c9e2d-6a9f-429f-8347-05758a195c97",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "4d8c94ed-af9a-4f8d-be00-2e692717b227",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/data/test_foundation_b-dataset-404.xml",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ }
+ ],
+ "sort": "title_string asc",
+ "search_facets": {}
+ }
+}
\ No newline at end of file
diff --git a/tests/artifacts/ckan-registry-datasets-05-1-dataset-updated.json b/tests/artifacts/ckan-registry-datasets-05-1-dataset-updated.json
new file mode 100644
index 0000000..8ecf741
--- /dev/null
+++ b/tests/artifacts/ckan-registry-datasets-05-1-dataset-updated.json
@@ -0,0 +1,109 @@
+{
+ "help": "https://iatiregistry.org/api/3/action/help_show?name=package_search",
+ "success": true,
+ "result": {
+ "count": 1,
+ "facets": {},
+ "results": [
+ {
+ "author": null,
+ "author_email": "publisher@email-here.com",
+ "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb",
+ "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159",
+ "isopen": true,
+ "license_id": "other-at",
+ "license_title": "Other (Attribution)",
+ "maintainer": null,
+ "maintainer_email": null,
+ "metadata_created": "2024-03-04T10:24:11.373108",
+ "metadata_modified": "2024-05-07T15:38:58.740018",
+ "name": "test_foundation_a-dataset-001",
+ "notes": "",
+ "num_resources": 1,
+ "num_tags": 0,
+ "organization": {
+ "id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f",
+ "name": "test_org_a",
+ "title": "Test Foundation A",
+ "type": "organization",
+ "description": "",
+ "image_url": "",
+ "created": "2020-02-24T20:56:01.763851",
+ "is_organization": true,
+ "approval_status": "approved",
+ "state": "active"
+ },
+ "owner_org": "5d04f169-c702-45fe-8162-da7834859d86",
+ "private": false,
+ "state": "active",
+ "title": "040324",
+ "type": "dataset",
+ "url": null,
+ "version": null,
+ "extras": [
+ {
+ "key": "activity_count",
+ "value": "10"
+ },
+ {
+ "key": "country",
+ "value": "GB"
+ },
+ {
+ "key": "data_updated",
+ "value": "2024-03-01 14:24:09"
+ },
+ {
+ "key": "filetype",
+ "value": "organisation"
+ },
+ {
+ "key": "iati_version",
+ "value": "2.03"
+ },
+ {
+ "key": "language",
+ "value": ""
+ },
+ {
+ "key": "secondary_publisher",
+ "value": ""
+ },
+ {
+ "key": "validation_status",
+ "value": "Not Found"
+ }
+ ],
+ "resources": [
+ {
+ "cache_last_updated": null,
+ "cache_url": null,
+ "created": "2024-05-07T15:38:57.312249",
+ "description": null,
+ "format": "IATI-XML",
+ "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac",
+ "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe",
+ "last_modified": null,
+ "metadata_modified": "2024-05-07T15:38:58.757860",
+ "mimetype": "",
+ "mimetype_inner": null,
+ "name": null,
+ "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9",
+ "position": 0,
+ "resource_type": null,
+ "size": 399382,
+ "state": "active",
+ "url": "http://localhost:3000/not_found",
+ "url_type": null
+ }
+ ],
+ "tags": [],
+ "groups": [],
+ "relationships_as_subject": [],
+ "relationships_as_object": []
+ }
+ ],
+ "sort": "title_string asc",
+ "search_facets": {}
+ }
+}
\ No newline at end of file
diff --git a/tests/artifacts/config-files/env-file-1 b/tests/artifacts/config-files/env-file-1
new file mode 100644
index 0000000..f6f8d2b
--- /dev/null
+++ b/tests/artifacts/config-files/env-file-1
@@ -0,0 +1,33 @@
+DATA_REGISTRATION=ckan-registry
+DATA_REGISTRY_BASE_URL=http://localhost:3000/registration-service/
+
+BLOB_STORAGE_BASE_PUBLIC_URL=http://127.0.0.1:10000/devstoreaccount1
+
+NUMBER_DOWNLOADER_THREADS=1
+
+FORCE_REDOWNLOAD_AFTER_HOURS=24
+
+REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72
+
+# Log file
+LOGFILE=
+
+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
+
+DB_NAME=bulk_data_service_db
+DB_USER=bds
+
+# Local setup - values read by docker compose, and used by the app
+DB_PASS=pass
+DB_HOST=localhost
+DB_PORT=5255
+DB_SSL_MODE=disable
+DB_CONNECTION_TIMEOUT=30
+
+
+# Local Azurite Emulator
+AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;
+
+
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip
diff --git a/tests/artifacts/config-files/env-file-2 b/tests/artifacts/config-files/env-file-2
new file mode 100644
index 0000000..f010261
--- /dev/null
+++ b/tests/artifacts/config-files/env-file-2
@@ -0,0 +1,33 @@
+DATA_REGISTRATION=ckan-registry
+DATA_REGISTRY_BASE_URL=http://localhost:3000/registration-service/
+
+BLOB_STORAGE_BASE_PUBLIC_URL=http://127.0.0.1:10000/devstoreaccount1/
+
+NUMBER_DOWNLOADER_THREADS=1
+
+FORCE_REDOWNLOAD_AFTER_HOURS=24
+
+REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72
+
+# Log file
+LOGFILE=
+
+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
+
+DB_NAME=bulk_data_service_db
+DB_USER=bds
+
+# Local setup - values read by docker compose, and used by the app
+DB_PASS=pass
+DB_HOST=localhost
+DB_PORT=5255
+DB_SSL_MODE=disable
+DB_CONNECTION_TIMEOUT=30
+
+
+# Local Azurite Emulator
+AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;
+
+
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
+AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip
diff --git a/tests/artifacts/iati-xml-files/test_foundation_a-dataset-001.xml b/tests/artifacts/iati-xml-files/test_foundation_a-dataset-001.xml
new file mode 100644
index 0000000..9ef4590
--- /dev/null
+++ b/tests/artifacts/iati-xml-files/test_foundation_a-dataset-001.xml
@@ -0,0 +1,22 @@
+
+
+