Canvas sandbox dag (#174)
* create sandbox-dag

* fix comment

* update variables

* fix: debug dag for test project

* fix: dataset name changed

* fix: added a check to replace the dataset and project name for tests

* feat: added a check for existing tables and changed the view sources to the dbt dataset

* fix: inserted credentials in the BigQuery client

* fix: insert GCP authentication in the CI test

* updated DAG to follow Airflow best practices

* update sandbox dataset variable and schedule interval
cayod authored Jul 11, 2023
1 parent 7f52a1e commit 8bfb420
Showing 9 changed files with 190 additions and 2 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
@@ -51,6 +51,11 @@ jobs:
      - name: Import Airflow variables
        run: airflow variables import airflow_variables_ci.json

      - name: Authenticate to test-hubble GCP
        uses: google-github-actions/auth@v1
        with:
          credentials_json: "${{ secrets.CREDS_TEST_HUBBLE }}"

      - name: Pytest
        run: pytest dags/
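
For context, pytest dags/ typically runs a DAG-integrity check, which is what catches import errors in the two new sandbox DAGs. A minimal sketch of such a test, assuming a hypothetical dags/test_dag_integrity.py that is not part of this commit:

# Hypothetical test file, not part of this commit -- a sketch only.
from airflow.models import DagBag


def test_dags_import_without_errors():
    # Parse everything under dags/, including sandbox_create_dag.py and
    # sandbox_update_dag.py, and fail if any file raises on import.
    dag_bag = DagBag(dag_folder="dags/", include_examples=False)
    assert dag_bag.import_errors == {}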

2 changes: 2 additions & 0 deletions .sqlfluff
@@ -73,6 +73,8 @@ unwrap_wrapped_queries = True
[sqlfluff:templater:python:context]
project_id = project_id
dataset_id = dataset_id
target_dataset = target_dataset
table_id = table_id
batch_id = batch_id
batch_run_date = batch_run_date
prev_batch_run_date = prev_batch_run_date
11 changes: 10 additions & 1 deletion airflow_variables.json
@@ -258,7 +258,8 @@
"signers": "account_signers",
"trades": "history_trades",
"transactions": "history_transactions",
"trustlines": "trust_lines"
"trustlines": "trust_lines",
"enriched_history_operations": "enriched_history_operations"
},
"task_timeout": {
"build_batch_stats": 180,
@@ -270,7 +271,15 @@
"build_gcs_to_bq_task": 300,
"build_time_task": 120
},
"dbt_tables": {
"signers_current": "account_signers_current",
"accounts_current": "accounts_current",
"liquidity_pools_current": "liquidity_pools_current",
"offers_current": "offers_current",
"trustlines_current": "trust_lines_current"
},
"use_testnet": "False",
"sandbox_dataset": "crypto_stellar_internal_sandbox",
"volume_config": "{}",
"volume_name": "etl-data"
}
11 changes: 10 additions & 1 deletion airflow_variables_dev.json
@@ -237,7 +237,8 @@
"signers": "account_signers",
"trades": "history_trades",
"transactions": "history_transactions",
"trustlines": "trust_lines"
"trustlines": "trust_lines",
"enriched_history_operations": "enriched_history_operations"
},
"task_timeout": {
"build_batch_stats": 180,
@@ -249,7 +250,15 @@
"build_gcs_to_bq_task": 300,
"build_time_task": 120
},
"dbt_tables": {
"signers_current": "account_signers_current",
"accounts_current": "accounts_current",
"liquidity_pools_current": "liquidity_pools_current",
"offers_current": "offers_current",
"trustlines_current": "trust_lines_current"
},
"use_testnet": "True",
"sandbox_dataset": "crypto_stellar_internal_sandbox",
"volume_config": {},
"volume_name": "etl-data"
}
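
The new sandbox_dataset and dbt_tables entries are read at DAG parse time through Airflow's Variable API, as the sandbox DAGs below show. A minimal standalone sketch, assuming the variables file has already been imported into the Airflow metadata database:

# Sketch: reading the new variables; mirrors what the sandbox DAGs do.
from airflow.models.variable import Variable

SANDBOX_DATASET = Variable.get("sandbox_dataset")               # "crypto_stellar_internal_sandbox"
DBT_TABLES = Variable.get("dbt_tables", deserialize_json=True)  # dict of view alias -> dbt table name
print(SANDBOX_DATASET, DBT_TABLES["trustlines_current"])        # ... trust_lines_current
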
7 changes: 7 additions & 0 deletions dags/queries/create_table.sql
@@ -0,0 +1,7 @@
create or replace table `{project_id}.{target_dataset}.{table_id}`
partition by date_trunc(batch_run_date, month)
options (partition_expiration_days = 180) as (
select *
from `{project_id}.{dataset_id}.{table_id}`
where batch_run_date >= date_sub(current_date(), interval 6 month)
)
3 changes: 3 additions & 0 deletions dags/queries/create_view.sql
@@ -0,0 +1,3 @@
create or replace view `{project_id}.{target_dataset}.{table_id}` as (
select * from `{project_id}.{dataset_id}.{table_id}`
)
4 changes: 4 additions & 0 deletions dags/queries/update_table.sql
@@ -0,0 +1,4 @@
insert into `{project_id}.{target_dataset}.{table_id}`
select *
from `{project_id}.{dataset_id}.{table_id}`
where date_trunc(batch_run_date, month) = date_trunc(current_date() - interval 1 month, month)
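
The three SQL files above are plain Python format strings: the DAGs below read them with file_to_string and fill the placeholders with str.format before submitting the job to BigQuery. A minimal sketch of that rendering step; the project and source dataset names here are illustrative assumptions:

# Sketch: render update_table.sql the way the DAGs do (names are assumptions).
from stellar_etl_airflow.build_bq_insert_job_task import (
    file_to_string,
    get_query_filepath,
)

sql_params = {
    "project_id": "test-hubble",                          # assumed project
    "dataset_id": "crypto_stellar",                       # assumed source dataset
    "table_id": "history_transactions",
    "target_dataset": "crypto_stellar_internal_sandbox",  # the sandbox_dataset variable
}
query = file_to_string(get_query_filepath("update_table")).format(**sql_params)
# query now reads: insert into `test-hubble.crypto_stellar_internal_sandbox.history_transactions` ...
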
88 changes: 88 additions & 0 deletions dags/sandbox_create_dag.py
@@ -0,0 +1,88 @@
"""
This DAG creates the sandbox dataset with transactions tables, state tables with history and views.
"""
import datetime
import json

from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.empty import EmptyOperator
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from stellar_etl_airflow.build_bq_insert_job_task import (
file_to_string,
get_query_filepath,
)
from stellar_etl_airflow.default import (
alert_after_max_retries,
get_default_dag_args,
init_sentry,
)

init_sentry()

with DAG(
"sandbox_create_dag",
default_args=get_default_dag_args(),
start_date=datetime.datetime(2023, 1, 1),
description="This DAG creates a sandbox",
schedule_interval="@once",
params={"alias": "sandbox_dataset"},
user_defined_filters={
"fromjson": lambda s: json.loads(s),
},
catchup=False,
) as dag:
PROJECT = Variable.get("bq_project")
DATASET = Variable.get("bq_dataset")
SANDBOX_DATASET = Variable.get("sandbox_dataset")
DBT_DATASET = Variable.get("dbt_mart_dataset")
TABLES_ID = Variable.get("table_ids", deserialize_json=True)
DBT_TABLES = Variable.get("dbt_tables", deserialize_json=True)

start_tables_task = EmptyOperator(task_id="start_tables_task")
start_views_task = EmptyOperator(task_id="start_views_task")

query_path = get_query_filepath("create_table")
query = file_to_string(query_path)
for table_id in TABLES_ID:
sql_params = {
"project_id": PROJECT,
"dataset_id": DATASET,
"table_id": TABLES_ID[table_id],
"target_dataset": SANDBOX_DATASET,
}
query = query.format(**sql_params)
tables_create_task = BigQueryInsertJobOperator(
task_id=f"create_{table_id}",
configuration={
"query": {
"query": query,
"useLegacySql": False,
}
},
on_failure_callback=alert_after_max_retries,
)

start_tables_task >> tables_create_task

query_path = get_query_filepath("create_view")
query = file_to_string(query_path)
for dbt_table in DBT_TABLES:
sql_params = {
"project_id": PROJECT,
"dataset_id": DBT_DATASET,
"table_id": DBT_TABLES[dbt_table],
"target_dataset": SANDBOX_DATASET,
}
query = query.format(**sql_params)
dbt_tables_create_task = BigQueryInsertJobOperator(
task_id=f"create_{dbt_table}",
configuration={
"query": {
"query": query,
"useLegacySql": False,
}
},
on_failure_callback=alert_after_max_retries,
)
start_views_task >> dbt_tables_create_task
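
Because tasks are generated from the table_ids and dbt_tables variables, the DAG ends up with one create_<name> task per dictionary entry. A quick, assumed way to confirm the generated task IDs locally (requires the variables above in the local Airflow metadata database):

# Sketch: list the dynamically generated tasks of sandbox_create_dag.
from airflow.models import DagBag

dag = DagBag(dag_folder="dags/", include_examples=False).get_dag("sandbox_create_dag")
print(sorted(task.task_id for task in dag.tasks))
# e.g. ['create_enriched_history_operations', ..., 'create_trustlines_current',
#       'start_tables_task', 'start_views_task']
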
61 changes: 61 additions & 0 deletions dags/sandbox_update_dag.py
@@ -0,0 +1,61 @@
"""
This DAG update the Canvas sandbox dataset with transactions tables, state tables with history once a month.
"""
import datetime
import json

from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.empty import EmptyOperator
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from stellar_etl_airflow.build_bq_insert_job_task import (
file_to_string,
get_query_filepath,
)
from stellar_etl_airflow.default import (
alert_after_max_retries,
get_default_dag_args,
init_sentry,
)

init_sentry()

with DAG(
"sandbox_update_dag",
default_args=get_default_dag_args(),
start_date=datetime.datetime(2023, 1, 1),
description="This DAG updates a sandbox",
schedule_interval="0 6 1 * *",
params={"alias": "sandbox_dataset"},
user_defined_filters={"fromjson": lambda s: json.loads(s)},
catchup=False,
) as dag:
TABLES_ID = Variable.get("table_ids", deserialize_json=True)
PROJECT = Variable.get("bq_project")
BQ_DATASET = Variable.get("bq_dataset")
SANDBOX_DATASET = Variable.get("sandbox_dataset")

start_tables_task = EmptyOperator(task_id="start_tables_task")

query_path = get_query_filepath("update_table")
query = file_to_string(query_path)
for table_id in TABLES_ID:
sql_params = {
"project_id": PROJECT,
"dataset_id": BQ_DATASET,
"table_id": TABLES_ID[table_id],
"target_dataset": SANDBOX_DATASET,
}
query = query.format(**sql_params)
tables_update_task = BigQueryInsertJobOperator(
task_id=f"update_{table_id}",
configuration={
"query": {
"query": query,
"useLegacySql": False,
}
},
on_failure_callback=alert_after_max_retries,
)

start_tables_task >> tables_update_task
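
The schedule_interval "0 6 1 * *" fires at 06:00 UTC on the first day of every month, and update_table.sql then appends only the rows whose batch_run_date falls in the previous calendar month. A small illustration of which partition a given run targets, using an assumed run date:

# Sketch: the monthly partition an update run inserts, for an assumed run date.
import datetime

run_date = datetime.date(2023, 8, 1)  # assumed: the schedule fires on the 1st
previous_month = (run_date.replace(day=1) - datetime.timedelta(days=1)).replace(day=1)
print(previous_month)  # 2023-07-01, i.e. date_trunc(current_date() - interval 1 month, month)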
