From f650e2c3a1de40e008d4108cd915ad83627b1ea5 Mon Sep 17 00:00:00 2001 From: Laura Wrubel Date: Mon, 5 Aug 2024 14:46:22 -0400 Subject: [PATCH] Copy not symlink latest data files --- docker-compose.prod.yaml | 1 + docker-compose.yaml | 3 +-- rialto_airflow/dags/harvest.py | 17 +++++++---------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/docker-compose.prod.yaml b/docker-compose.prod.yaml index be05753..c4cdde8 100644 --- a/docker-compose.prod.yaml +++ b/docker-compose.prod.yaml @@ -87,6 +87,7 @@ x-airflow-common: AIRFLOW_VAR_SUL_PUB_HOST: ${AIRFLOW_VAR_SUL_PUB_HOST} AIRFLOW_VAR_SUL_PUB_KEY: ${AIRFLOW_VAR_SUL_PUB_KEY} AIRFLOW_VAR_DATA_DIR: /opt/airflow/data + AIRFLOW_VAR_PUBLISH_DIR: /opt/airflow/data/latest volumes: - /opt/app/rialto/rialto-airflow/current/rialto_airflow:/opt/airflow/rialto_airflow - /data:/opt/airflow/data diff --git a/docker-compose.yaml b/docker-compose.yaml index dec31a0..5729683 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -80,11 +80,10 @@ x-airflow-common: AIRFLOW_VAR_SUL_PUB_KEY: ${AIRFLOW_VAR_SUL_PUB_KEY} AIRFLOW_VAR_DEV_LIMIT: ${AIRFLOW_VAR_DEV_LIMIT} AIRFLOW_VAR_DATA_DIR: /opt/airflow/data + AIRFLOW_VAR_PUBLISH_DIR: /opt/airflow/data/latest AIRFLOW_VAR_OPENALEX_EMAIL: ${AIRFLOW_VAR_OPENALEX_EMAIL} volumes: - ${AIRFLOW_PROJ_DIR:-.}/rialto_airflow:/opt/airflow/rialto_airflow - # TODO: we may want to put logs and data outside of the project directory so - # they can persist across capistrano deploys? - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - ${AIRFLOW_PROJ_DIR:-.}/data:/opt/airflow/data user: "503:0" diff --git a/rialto_airflow/dags/harvest.py b/rialto_airflow/dags/harvest.py index 13601c1..e1666b2 100644 --- a/rialto_airflow/dags/harvest.py +++ b/rialto_airflow/dags/harvest.py @@ -1,6 +1,7 @@ import datetime import pickle from pathlib import Path +import shutil from airflow.decorators import dag, task from airflow.models import Variable @@ -12,6 +13,7 @@ from rialto_airflow.utils import create_snapshot_dir, rialto_authors_file data_dir = Variable.get("data_dir") +publish_dir = Variable.get("publish_dir") sul_pub_host = Variable.get("sul_pub_host") sul_pub_key = Variable.get("sul_pub_key") @@ -130,18 +132,13 @@ def publish(pubs_to_contribs, merge_publications): """ Publish aggregate data to JupyterHub environment. """ - contribs_path = Path(data_dir) / "latest" / "contributions.parquet" - pubs_path = Path(data_dir) / "latest" / "publications.parquet" + contribs_path = Path(publish_dir) / "contributions.parquet" + pubs_path = Path(publish_dir) / "publications.parquet" - if contribs_path.exists(): - contribs_path.unlink() - if pubs_path.exists(): - pubs_path.unlink() + shutil.copyfile(pubs_to_contribs, contribs_path) + shutil.copyfile(merge_publications, pubs_path) - contribs_path.symlink_to(pubs_to_contribs) - pubs_path.symlink_to(merge_publications) - - return str(contribs_path), str(pubs_path) + return str(publish_dir) snapshot_dir = setup()