Canvas sandbox dag (#174)
* create sandbox-dag

* fix comment

* update variables

* fix: debug dag for test project

* fix: dataset name changed

* fix: added a check to replace the dataset and project name for tests

* feat: added a check for existing tables and changed the view sources to the dbt dataset

* fix: inserted credentials in the BigQuery client

* fix: insert GCP authentication in the CI test

* updated DAG to follow Airflow best practices

* update sandbox dataset variable and schedule interval
cayod authored Jul 11, 2023
1 parent 7f52a1e commit 8bfb420
Showing 9 changed files with 190 additions and 2 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
@@ -51,6 +51,11 @@ jobs:
      - name: Import Airflow variables
        run: airflow variables import airflow_variables_ci.json

      - name: Authenticate to test-hubble GCP
        uses: google-github-actions/auth@v1
        with:
          credentials_json: "${{ secrets.CREDS_TEST_HUBBLE }}"

      - name: Pytest
        run: pytest dags/
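
For context, pytest dags/ typically runs a DAG-integrity check, which is what catches import errors in the two new sandbox DAGs. A minimal sketch of such a test, assuming a hypothetical dags/test_dag_integrity.py that is not part of this commit:

# Hypothetical test file, not part of this commit -- a sketch only.
from airflow.models import DagBag


def test_dags_import_without_errors():
    # Parse everything under dags/, including sandbox_create_dag.py and
    # sandbox_update_dag.py, and fail if any file raises on import.
    dag_bag = DagBag(dag_folder="dags/", include_examples=False)
    assert dag_bag.import_errors == {}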

2 changes: 2 additions & 0 deletions .sqlfluff
@@ -73,6 +73,8 @@ unwrap_wrapped_queries = True
[sqlfluff:templater:python:context]
project_id = project_id
dataset_id = dataset_id
target_dataset = target_dataset
table_id = table_id
batch_id = batch_id
batch_run_date = batch_run_date
prev_batch_run_date = prev_batch_run_date
11 changes: 10 additions & 1 deletion airflow_variables.json
@@ -258,7 +258,8 @@
"signers": "account_signers",
"trades": "history_trades",
"transactions": "history_transactions",
"trustlines": "trust_lines"
"trustlines": "trust_lines",
"enriched_history_operations": "enriched_history_operations"
},
"task_timeout": {
"build_batch_stats": 180,
@@ -270,7 +271,15 @@
"build_gcs_to_bq_task": 300,
"build_time_task": 120
},
"dbt_tables": {
"signers_current": "account_signers_current",
"accounts_current": "accounts_current",
"liquidity_pools_current": "liquidity_pools_current",
"offers_current": "offers_current",
"trustlines_current": "trust_lines_current"
},
"use_testnet": "False",
"sandbox_dataset": "crypto_stellar_internal_sandbox",
"volume_config": "{}",
"volume_name": "etl-data"
}
11 changes: 10 additions & 1 deletion airflow_variables_dev.json
@@ -237,7 +237,8 @@
"signers": "account_signers",
"trades": "history_trades",
"transactions": "history_transactions",
"trustlines": "trust_lines"
"trustlines": "trust_lines",
"enriched_history_operations": "enriched_history_operations"
},
"task_timeout": {
"build_batch_stats": 180,
@@ -249,7 +250,15 @@
"build_gcs_to_bq_task": 300,
"build_time_task": 120
},
"dbt_tables": {
"signers_current": "account_signers_current",
"accounts_current": "accounts_current",
"liquidity_pools_current": "liquidity_pools_current",
"offers_current": "offers_current",
"trustlines_current": "trust_lines_current"
},
"use_testnet": "True",
"sandbox_dataset": "crypto_stellar_internal_sandbox",
"volume_config": {},
"volume_name": "etl-data"
}
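
The new sandbox_dataset and dbt_tables entries are read at DAG parse time through Airflow's Variable API, as the sandbox DAGs below show. A minimal standalone sketch, assuming the variables file has already been imported into the Airflow metadata database:

# Sketch: reading the new variables; mirrors what the sandbox DAGs do.
from airflow.models.variable import Variable

SANDBOX_DATASET = Variable.get("sandbox_dataset")               # "crypto_stellar_internal_sandbox"
DBT_TABLES = Variable.get("dbt_tables", deserialize_json=True)  # dict of view alias -> dbt table name
print(SANDBOX_DATASET, DBT_TABLES["trustlines_current"])        # ... trust_lines_current
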
7 changes: 7 additions & 0 deletions dags/queries/create_table.sql
@@ -0,0 +1,7 @@
create or replace table `{project_id}.{target_dataset}.{table_id}`
partition by date_trunc(batch_run_date, month)
options (partition_expiration_days = 180) as (
select *
from `{project_id}.{dataset_id}.{table_id}`
where batch_run_date >= date_sub(current_date(), interval 6 month)
)
3 changes: 3 additions & 0 deletions dags/queries/create_view.sql
@@ -0,0 +1,3 @@
create or replace view `{project_id}.{target_dataset}.{table_id}` as (
select * from `{project_id}.{dataset_id}.{table_id}`
)
4 changes: 4 additions & 0 deletions dags/queries/update_table.sql
@@ -0,0 +1,4 @@
insert into `{project_id}.{target_dataset}.{table_id}`
select *
from `{project_id}.{dataset_id}.{table_id}`
where date_trunc(batch_run_date, month) = date_trunc(current_date() - interval 1 month, month)
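
The three SQL files above are plain Python format strings: the DAGs below read them with file_to_string and fill the placeholders with str.format before submitting the job to BigQuery. A minimal sketch of that rendering step; the project and source dataset names here are illustrative assumptions:

# Sketch: render update_table.sql the way the DAGs do (names are assumptions).
from stellar_etl_airflow.build_bq_insert_job_task import (
    file_to_string,
    get_query_filepath,
)

sql_params = {
    "project_id": "test-hubble",                          # assumed project
    "dataset_id": "crypto_stellar",                       # assumed source dataset
    "table_id": "history_transactions",
    "target_dataset": "crypto_stellar_internal_sandbox",  # the sandbox_dataset variable
}
query = file_to_string(get_query_filepath("update_table")).format(**sql_params)
# query now reads: insert into `test-hubble.crypto_stellar_internal_sandbox.history_transactions` ...
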
88 changes: 88 additions & 0 deletions dags/sandbox_create_dag.py
@@ -0,0 +1,88 @@
"""
This DAG creates the sandbox dataset with transactions tables, state tables with history and views.
"""
import datetime
import json

from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.empty import EmptyOperator
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from stellar_etl_airflow.build_bq_insert_job_task import (
file_to_string,
get_query_filepath,
)
from stellar_etl_airflow.default import (
alert_after_max_retries,
get_default_dag_args,
init_sentry,
)

init_sentry()

with DAG(
"sandbox_create_dag",
default_args=get_default_dag_args(),
start_date=datetime.datetime(2023, 1, 1),
description="This DAG creates a sandbox",
schedule_interval="@once",
params={"alias": "sandbox_dataset"},
user_defined_filters={
"fromjson": lambda s: json.loads(s),
},
catchup=False,
) as dag:
PROJECT = Variable.get("bq_project")
DATASET = Variable.get("bq_dataset")
SANDBOX_DATASET = Variable.get("sandbox_dataset")
DBT_DATASET = Variable.get("dbt_mart_dataset")
TABLES_ID = Variable.get("table_ids", deserialize_json=True)
DBT_TABLES = Variable.get("dbt_tables", deserialize_json=True)

start_tables_task = EmptyOperator(task_id="start_tables_task")
start_views_task = EmptyOperator(task_id="start_views_task")

query_path = get_query_filepath("create_table")
query = file_to_string(query_path)
for table_id in TABLES_ID:
sql_params = {
"project_id": PROJECT,
"dataset_id": DATASET,
"table_id": TABLES_ID[table_id],
"target_dataset": SANDBOX_DATASET,
}
query = query.format(**sql_params)
tables_create_task = BigQueryInsertJobOperator(
task_id=f"create_{table_id}",
configuration={
"query": {
"query": query,
"useLegacySql": False,
}
},
on_failure_callback=alert_after_max_retries,
)

start_tables_task >> tables_create_task

query_path = get_query_filepath("create_view")
query = file_to_string(query_path)
for dbt_table in DBT_TABLES:
sql_params = {
"project_id": PROJECT,
"dataset_id": DBT_DATASET,
"table_id": DBT_TABLES[dbt_table],
"target_dataset": SANDBOX_DATASET,
}
query = query.format(**sql_params)
dbt_tables_create_task = BigQueryInsertJobOperator(
task_id=f"create_{dbt_table}",
configuration={
"query": {
"query": query,
"useLegacySql": False,
}
},
on_failure_callback=alert_after_max_retries,
)
start_views_task >> dbt_tables_create_task
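
Because tasks are generated from the table_ids and dbt_tables variables, the DAG ends up with one create_<name> task per dictionary entry. A quick, assumed way to confirm the generated task IDs locally (requires the variables above in the local Airflow metadata database):

# Sketch: list the dynamically generated tasks of sandbox_create_dag.
from airflow.models import DagBag

dag = DagBag(dag_folder="dags/", include_examples=False).get_dag("sandbox_create_dag")
print(sorted(task.task_id for task in dag.tasks))
# e.g. ['create_enriched_history_operations', ..., 'create_trustlines_current',
#       'start_tables_task', 'start_views_task']
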
61 changes: 61 additions & 0 deletions dags/sandbox_update_dag.py
@@ -0,0 +1,61 @@
"""
This DAG update the Canvas sandbox dataset with transactions tables, state tables with history once a month.
"""
import datetime
import json

from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.empty import EmptyOperator
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from stellar_etl_airflow.build_bq_insert_job_task import (
file_to_string,
get_query_filepath,
)
from stellar_etl_airflow.default import (
alert_after_max_retries,
get_default_dag_args,
init_sentry,
)

init_sentry()

with DAG(
"sandbox_update_dag",
default_args=get_default_dag_args(),
start_date=datetime.datetime(2023, 1, 1),
description="This DAG updates a sandbox",
schedule_interval="0 6 1 * *",
params={"alias": "sandbox_dataset"},
user_defined_filters={"fromjson": lambda s: json.loads(s)},
catchup=False,
) as dag:
TABLES_ID = Variable.get("table_ids", deserialize_json=True)
PROJECT = Variable.get("bq_project")
BQ_DATASET = Variable.get("bq_dataset")
SANDBOX_DATASET = Variable.get("sandbox_dataset")

start_tables_task = EmptyOperator(task_id="start_tables_task")

query_path = get_query_filepath("update_table")
query = file_to_string(query_path)
for table_id in TABLES_ID:
sql_params = {
"project_id": PROJECT,
"dataset_id": BQ_DATASET,
"table_id": TABLES_ID[table_id],
"target_dataset": SANDBOX_DATASET,
}
query = query.format(**sql_params)
tables_update_task = BigQueryInsertJobOperator(
task_id=f"update_{table_id}",
configuration={
"query": {
"query": query,
"useLegacySql": False,
}
},
on_failure_callback=alert_after_max_retries,
)

start_tables_task >> tables_update_task
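
The schedule_interval "0 6 1 * *" fires at 06:00 UTC on the first day of every month, and update_table.sql then appends only the rows whose batch_run_date falls in the previous calendar month. A small illustration of which partition a given run targets, using an assumed run date:

# Sketch: the monthly partition an update run inserts, for an assumed run date.
import datetime

run_date = datetime.date(2023, 8, 1)  # assumed: the schedule fires on the 1st
previous_month = (run_date.replace(day=1) - datetime.timedelta(days=1)).replace(day=1)
print(previous_month)  # 2023-07-01, i.e. date_trunc(current_date() - interval 1 month, month)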
