From ac56ac60ed648f91eaa6e867438c7e680dbaace2 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Wed, 28 Feb 2024 11:15:00 -0500 Subject: [PATCH] feat: Use dbt state to avoid large table rebuilds --- tutoraspects/commands_v1.py | 17 +++++++++++++--- tutoraspects/patches/k8s-jobs | 4 ++++ tutoraspects/patches/k8s-volumes | 15 ++++++++++++++ .../local-docker-compose-jobs-services | 1 + tutoraspects/plugin.py | 2 ++ .../aspects/apps/aspects/scripts/dbt.sh | 20 +++++++++++++++++-- .../aspects/jobs/init/aspects/init-aspects.sh | 2 +- 7 files changed, 55 insertions(+), 6 deletions(-) diff --git a/tutoraspects/commands_v1.py b/tutoraspects/commands_v1.py index 04a1f4052..ee54e0cf3 100644 --- a/tutoraspects/commands_v1.py +++ b/tutoraspects/commands_v1.py @@ -52,7 +52,18 @@ def load_xapi_test_data(config_file) -> list[tuple[str, str]]: tutor local do dbt -c "run -m enrollments_by_day --threads 4" """, ) -def dbt(command: string) -> list[tuple[str, str]]: +@click.option( + "--only_changed", + default=True, + type=click.UNPROCESSED, + help="""Whether to only run models that have changed since the last dbt run. Since + re-running materialized views will recreate potentially huge datasets and + incur downtime, this defaults to true. + + If no prior state is found, the command will run as if this was False. + """, +) +def dbt(only_changed: bool, command: string) -> list[tuple[str, str]]: """ Job that proxies dbt commands to a container which runs them against ClickHouse. """ @@ -60,8 +71,8 @@ def dbt(command: string) -> list[tuple[str, str]]: ( "aspects", "echo 'Making dbt script executable...' && " - f"echo 'Running dbt {command}' && " - f"bash /app/aspects/scripts/dbt.sh {command} && " + f"echo 'Running dbt, only_changed: {only_changed} command: {command}' && " + f"bash /app/aspects/scripts/dbt.sh {only_changed} {command} && " "echo 'Done!';", ), ] diff --git a/tutoraspects/patches/k8s-jobs b/tutoraspects/patches/k8s-jobs index 57fe1428e..36eaf4b23 100644 --- a/tutoraspects/patches/k8s-jobs +++ b/tutoraspects/patches/k8s-jobs @@ -18,6 +18,8 @@ spec: value: {{ ASPECTS_XAPI_DATABASE }} - name: ASPECTS_ENROLLMENT_EVENTS_TABLE value: {{ ASPECTS_ENROLLMENT_EVENTS_TABLE }} + - name: DBT_STATE + value: {{ DBT_STATE_DIR }} image: {{ DOCKER_IMAGE_ASPECTS }} securityContext: allowPrivilegeEscalation: false @@ -35,6 +37,8 @@ spec: name: alembic - mountPath: /app/aspects/migrations/alembic/versions name: versions + - mountPath: {{ DBT_STATE_DIR }} + name: dbt-state volumes: - name: scripts configMap: diff --git a/tutoraspects/patches/k8s-volumes b/tutoraspects/patches/k8s-volumes index 82b32dac2..44117141f 100644 --- a/tutoraspects/patches/k8s-volumes +++ b/tutoraspects/patches/k8s-volumes @@ -1,3 +1,18 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: aspects-dbt + labels: + app.kubernetes.io/component: volume + app.kubernetes.io/name: aspects-dbt +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10M + {% if RUN_CLICKHOUSE %} --- apiVersion: v1 diff --git a/tutoraspects/patches/local-docker-compose-jobs-services b/tutoraspects/patches/local-docker-compose-jobs-services index 9e0ac64a6..334129c75 100644 --- a/tutoraspects/patches/local-docker-compose-jobs-services +++ b/tutoraspects/patches/local-docker-compose-jobs-services @@ -7,6 +7,7 @@ aspects-job: volumes: - ../../env/plugins/aspects/apps/aspects:/app/aspects - ../../env/plugins/aspects/apps/aspects/scripts/:/app/aspects/scripts:ro + - ../../data/aspects-dbt:{{ DBT_STATE_DIR }} {% if RUN_SUPERSET or RUN_CLICKHOUSE or RUN_RALPH %}depends_on:{% if RUN_SUPERSET %} - superset{% endif %}{% if RUN_CLICKHOUSE%} - clickhouse{% endif %}{% if RUN_RALPH %} diff --git a/tutoraspects/plugin.py b/tutoraspects/plugin.py index adff9b6f1..f5916805d 100644 --- a/tutoraspects/plugin.py +++ b/tutoraspects/plugin.py @@ -353,6 +353,8 @@ ("DBT_REPOSITORY", "https://github.com/openedx/aspects-dbt"), ("DBT_BRANCH", "v3.5.0"), ("DBT_SSH_KEY", ""), + + ("DBT_STATE_DIR", "/app/aspects/dbt_state/"), # This is a pip compliant list of Python packages to install to run dbt # make sure packages with versions are enclosed in double quotes ("EXTRA_DBT_PACKAGES", []), diff --git a/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh b/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh index cca3643ca..b75a53d6d 100644 --- a/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh +++ b/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh @@ -26,11 +26,27 @@ cd aspects-dbt || exit export ASPECTS_EVENT_SINK_DATABASE={{ASPECTS_EVENT_SINK_DATABASE}} export ASPECTS_XAPI_DATABASE={{ASPECTS_XAPI_DATABASE}} +export DBT_STATE={{ DBT_STATE_DIR }} +export DBT_TARGET_PATH={{ DBT_STATE_DIR }} echo "Installing dbt dependencies" dbt deps --profiles-dir /app/aspects/dbt/ -echo "Running dbt $*" -dbt "$@" --profiles-dir /app/aspects/dbt/ +echo "Running dbt ${@:2}" + +if [ "$1" == "True" ] +then + echo "Requested to only run modified state, checking for ${DBT_STATE}/manifest.json" +fi + +# If state exists and we've asked to only run changed files, add the flag +if [ "$1" == "True" ] && [ -e "${DBT_STATE}/manifest.json" ] +then + echo "Found {{DBT_STATE_DIR}}/manifest.json so only running modified items" + dbt "${@:2}" --profiles-dir /app/aspects/dbt/ -s state:modified +else + echo "Running command without state:modified+" + dbt "${@:2}" --profiles-dir /app/aspects/dbt/ +fi rm -rf aspects-dbt diff --git a/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh b/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh index 816b96ded..98ee08858 100644 --- a/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh +++ b/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh @@ -2,4 +2,4 @@ bash /app/aspects/scripts/alembic.sh upgrade head -bash /app/aspects/scripts/dbt.sh run +bash /app/aspects/scripts/dbt.sh True run