From e034246952c35a5846caece37caccfc099d739bd Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Wed, 28 Feb 2024 11:15:00 -0500 Subject: [PATCH] feat: Use dbt state to avoid large table rebuilds --- .github/workflows/integration-test.yml | 31 ++++++++++++++++--- tutoraspects/commands_v0.py | 18 +++++++++-- tutoraspects/commands_v1.py | 20 ++++++++++-- tutoraspects/patches/k8s-jobs | 7 +++++ tutoraspects/patches/k8s-volumes | 15 +++++++++ .../local-docker-compose-jobs-services | 1 + tutoraspects/plugin.py | 1 + .../aspects/apps/aspects/scripts/dbt.sh | 23 ++++++++++++-- .../aspects/jobs/init/aspects/init-aspects.sh | 2 +- 9 files changed, 104 insertions(+), 14 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index bb2938ed9..7bbd8c9de 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -57,8 +57,15 @@ jobs: run: | tutor local do alembic -c "downgrade base" tutor local do alembic -c "upgrade head" - - name: Test DBT - run: tutor local do dbt -c "test" + # This should: + # 1. Find no models on the first run, since state should be up to date + # 2. Force run all models + # 3. Successfully run tests + - name: Test dbt + run: | + tutor local do dbt -c "run" + tutor local do dbt --only_changed False -c "run" + tutor local do dbt --only_changed False -c "test" - name: Load test run: tutor local do load-xapi-test-data - name: Import demo course @@ -111,8 +118,15 @@ jobs: run: | tutor dev do alembic -c "downgrade base" tutor dev do alembic -c "upgrade head" - - name: Test DBT - run: tutor dev do dbt -c "test" + # This should: + # 1. Find no models on the first run, since state should be up to date + # 2. Force run all models + # 3. Successfully run tests + - name: Test dbt + run: | + tutor dev do dbt -c "run" + tutor dev do dbt --only_changed False -c "run" + tutor dev do dbt --only_changed False -c "test" - name: Load test run: tutor dev do load-xapi-test-data - name: Import demo course @@ -183,8 +197,15 @@ jobs: run: | tutor k8s do alembic -c "downgrade base" tutor k8s do alembic -c "upgrade head" + # This should: + # 1. Find no models on the first run, since state should be up to date + # 2. Force run all models + # 3. Successfully run tests - name: Test DBT - run: tutor k8s do dbt -c "test" + run: | + tutor k8s do dbt -c "run" + tutor k8s do dbt --only_changed False -c "run" + tutor k8s do dbt --only_changed False -c "test" - name: Load test run: tutor k8s do load-xapi-test-data - name: Import demo course diff --git a/tutoraspects/commands_v0.py b/tutoraspects/commands_v0.py index 930ed937e..2a4ab6522 100644 --- a/tutoraspects/commands_v0.py +++ b/tutoraspects/commands_v0.py @@ -23,8 +23,22 @@ tutor local do dbt -c "run -m enrollments_by_day --threads 4" """, ) +@click.option( + "--only_changed", + default=True, + type=click.UNPROCESSED, + help="""Whether to only run models that have changed since the last dbt run. Since + re-running materialized views will recreate potentially huge datasets and + incur downtime, this defaults to true. + + If no prior state is found, the command will run as if this was False. + + If your command fails due to an issue with "state:modified", you may need to + set this to False. + """, +) @click.pass_obj -def dbt(context, command) -> None: +def dbt(context, only_changed, command) -> None: """ Job that proxies dbt commands to a container which runs them against ClickHouse. """ @@ -33,7 +47,7 @@ def dbt(context, command) -> None: command = f"""echo 'Making dbt script executable...' echo 'Running dbt {command}' - bash /app/aspects/scripts/dbt.sh {command} + bash /app/aspects/scripts/dbt.sh {only_changed} {command} echo 'Done!'; """ runner.run_job("aspects", command) diff --git a/tutoraspects/commands_v1.py b/tutoraspects/commands_v1.py index 04a1f4052..593a1b883 100644 --- a/tutoraspects/commands_v1.py +++ b/tutoraspects/commands_v1.py @@ -52,7 +52,21 @@ def load_xapi_test_data(config_file) -> list[tuple[str, str]]: tutor local do dbt -c "run -m enrollments_by_day --threads 4" """, ) -def dbt(command: string) -> list[tuple[str, str]]: +@click.option( + "--only_changed", + default=True, + type=click.UNPROCESSED, + help="""Whether to only run models that have changed since the last dbt run. Since + re-running materialized views will recreate potentially huge datasets and + incur downtime, this defaults to true. + + If no prior state is found, the command will run as if this was False. + + If your command fails due to an issue with "state:modified", you may need to + set this to False. + """, +) +def dbt(only_changed: bool, command: string) -> list[tuple[str, str]]: """ Job that proxies dbt commands to a container which runs them against ClickHouse. """ @@ -60,8 +74,8 @@ def dbt(command: string) -> list[tuple[str, str]]: ( "aspects", "echo 'Making dbt script executable...' && " - f"echo 'Running dbt {command}' && " - f"bash /app/aspects/scripts/dbt.sh {command} && " + f"echo 'Running dbt, only_changed: {only_changed} command: {command}' && " + f"bash /app/aspects/scripts/dbt.sh {only_changed} {command} && " "echo 'Done!';", ), ] diff --git a/tutoraspects/patches/k8s-jobs b/tutoraspects/patches/k8s-jobs index 57fe1428e..8a03007bb 100644 --- a/tutoraspects/patches/k8s-jobs +++ b/tutoraspects/patches/k8s-jobs @@ -18,6 +18,8 @@ spec: value: {{ ASPECTS_XAPI_DATABASE }} - name: ASPECTS_ENROLLMENT_EVENTS_TABLE value: {{ ASPECTS_ENROLLMENT_EVENTS_TABLE }} + - name: DBT_STATE + value: {{ DBT_STATE_DIR }} image: {{ DOCKER_IMAGE_ASPECTS }} securityContext: allowPrivilegeEscalation: false @@ -35,7 +37,12 @@ spec: name: alembic - mountPath: /app/aspects/migrations/alembic/versions name: versions + - mountPath: {{ DBT_STATE_DIR }} + name: dbt-state volumes: + - name: dbt-state + persistentVolumeClaim: + claimName: aspects-dbt - name: scripts configMap: name: aspects-scripts diff --git a/tutoraspects/patches/k8s-volumes b/tutoraspects/patches/k8s-volumes index 82b32dac2..44117141f 100644 --- a/tutoraspects/patches/k8s-volumes +++ b/tutoraspects/patches/k8s-volumes @@ -1,3 +1,18 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: aspects-dbt + labels: + app.kubernetes.io/component: volume + app.kubernetes.io/name: aspects-dbt +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10M + {% if RUN_CLICKHOUSE %} --- apiVersion: v1 diff --git a/tutoraspects/patches/local-docker-compose-jobs-services b/tutoraspects/patches/local-docker-compose-jobs-services index 9e0ac64a6..334129c75 100644 --- a/tutoraspects/patches/local-docker-compose-jobs-services +++ b/tutoraspects/patches/local-docker-compose-jobs-services @@ -7,6 +7,7 @@ aspects-job: volumes: - ../../env/plugins/aspects/apps/aspects:/app/aspects - ../../env/plugins/aspects/apps/aspects/scripts/:/app/aspects/scripts:ro + - ../../data/aspects-dbt:{{ DBT_STATE_DIR }} {% if RUN_SUPERSET or RUN_CLICKHOUSE or RUN_RALPH %}depends_on:{% if RUN_SUPERSET %} - superset{% endif %}{% if RUN_CLICKHOUSE%} - clickhouse{% endif %}{% if RUN_RALPH %} diff --git a/tutoraspects/plugin.py b/tutoraspects/plugin.py index adff9b6f1..d6b48d1d0 100644 --- a/tutoraspects/plugin.py +++ b/tutoraspects/plugin.py @@ -353,6 +353,7 @@ ("DBT_REPOSITORY", "https://github.com/openedx/aspects-dbt"), ("DBT_BRANCH", "v3.5.0"), ("DBT_SSH_KEY", ""), + ("DBT_STATE_DIR", "/app/aspects/dbt_state/"), # This is a pip compliant list of Python packages to install to run dbt # make sure packages with versions are enclosed in double quotes ("EXTRA_DBT_PACKAGES", []), diff --git a/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh b/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh index cca3643ca..4109d9c79 100644 --- a/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh +++ b/tutoraspects/templates/aspects/apps/aspects/scripts/dbt.sh @@ -26,11 +26,28 @@ cd aspects-dbt || exit export ASPECTS_EVENT_SINK_DATABASE={{ASPECTS_EVENT_SINK_DATABASE}} export ASPECTS_XAPI_DATABASE={{ASPECTS_XAPI_DATABASE}} +export DBT_STATE={{ DBT_STATE_DIR }} echo "Installing dbt dependencies" dbt deps --profiles-dir /app/aspects/dbt/ -echo "Running dbt $*" -dbt "$@" --profiles-dir /app/aspects/dbt/ - +echo "Running dbt ${@:2}" + +if [ "$1" == "True" ] +then + echo "Requested to only run modified state, checking for ${DBT_STATE}/manifest.json" +fi + +# If state exists and we've asked to only run changed files, add the flag +if [ "$1" == "True" ] && [ -e "${DBT_STATE}/manifest.json" ] +then + echo "Found {{DBT_STATE_DIR}}/manifest.json so only running modified items and their downstreams" + dbt "${@:2}" --profiles-dir /app/aspects/dbt/ -s state:modified+ +else + echo "Running command *without* state:modified+ this may take a long time." + dbt "${@:2}" --profiles-dir /app/aspects/dbt/ +fi + +rm -rf ${DBT_STATE}/* +cp -r ./target/manifest.json ${DBT_STATE} rm -rf aspects-dbt diff --git a/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh b/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh index 816b96ded..98ee08858 100644 --- a/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh +++ b/tutoraspects/templates/aspects/jobs/init/aspects/init-aspects.sh @@ -2,4 +2,4 @@ bash /app/aspects/scripts/alembic.sh upgrade head -bash /app/aspects/scripts/dbt.sh run +bash /app/aspects/scripts/dbt.sh True run