Skip to content

Commit

Permalink
Update with report site generator.
Browse files Browse the repository at this point in the history
  • Loading branch information
GilHoggarth committed Oct 24, 2023
1 parent e95b3de commit 53b37d6
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 4 deletions.
1 change: 1 addition & 0 deletions manage/airflow/dags/_common_.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class Config():
# Define the common parameters for running Docker tasks:
w3act_task_image = 'ukwa/python-w3act:2.1.5'
ukwa_task_image = 'ukwa/ukwa-manage:2.4.1'
ukwa_reports_image = 'ukwa/ukwa-reports:master'
hadoop_docker_image = 'ukwa/docker-hadoop:2.1.2'
postgres_image = 'postgres:9.6.2'
rclone_image = 'rclone/rclone:1.62'
Expand Down
25 changes: 21 additions & 4 deletions manage/airflow/dags/update_storage_listings.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
'update_storage_listings',
description='Generate file store lists for reporting purposes.',
default_args=default_args,
schedule_interval='@daily',
schedule_interval='0 4 * * *',
start_date=days_ago(1),
catchup=False,
max_active_runs=1,
Expand All @@ -56,30 +56,33 @@
- Lists all the current files known to TrackDB.
- Lists all the web archive files on AWS S3.
It runs at 4am, so the daily TrackDB update should be complete by the time it runs.
Configuration:
* Uses AWS keys defined in an Airflow Connection called `amazon_s3`.
How to check it's working:
* Fresh JSONL listings in {c.storage_path}: `trackdb_list.jsonl`, `aws_s3_list.jsonl`
* The internal website is updated with fresh [reports](https://www.webarchive.org.uk/act/static/reports/), served from {c.w3act_static_web_root}.
* The [holdings report](https://www.webarchive.org.uk/act/nbapps/voila/render/ukwa-holdings-summary-report.ipynb) is working and presenting that data.
Tool container versions:
* UKWA Manage Task Image: `{c.ukwa_task_image}`
* UKWA Reports Image: `{c.ukwa_reports_image}`
"""

tdb = DockerOperator(
tdb_lister = DockerOperator(
task_id='export_trackdb_list',
image=c.ukwa_task_image,
# The lister prints its listing to stdout, so the command is wrapped in
# `bash -c` to let a shell redirect capture it in /storage/trackdb_list.jsonl:
command="bash -c 'python -m lib.filedb.trackdb_lister > /storage/trackdb_list.jsonl'",
)

tdb = DockerOperator(
s3_lister = DockerOperator(
task_id='export_aws_s3_list',
image=c.ukwa_task_image,
# Using bash to redirect stdout:
Expand All @@ -91,3 +94,17 @@
},
)

# Run the UKWA reports image against the two JSONL listings.
# No command is given, so the image's default command is used; the input and
# output locations are passed via environment variables (presumably read by
# the container's entrypoint -- confirm against the ukwa-reports image).
reports = DockerOperator(
task_id='update_ukwa_reports',
image=c.ukwa_reports_image,
environment={
# Paths of the two listing files, as seen inside the container:
'TRACKDB_LIST_JSONL': "/storage/trackdb_list.jsonl",
'AWS_S3_LIST_JSONL': "/storage/aws_s3_list.jsonl",
# Reports are written beneath the W3ACT static web root:
'OUTPUT_PATH': f'{c.w3act_static_web_root}/reports',
},
)

# Both listing exports must finish before the reports task starts, because
# it is configured with the paths of the JSONL files they write:
[ tdb_lister, s3_lister ] >> reports


0 comments on commit 53b37d6

Please sign in to comment.