Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate avro files from bq tables #507

Merged
merged 13 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .sqlfluff
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ batch_id = batch_id
batch_run_date = batch_run_date
prev_batch_run_date = prev_batch_run_date
next_batch_run_date = next_batch_run_date
uri = uri

# Some rules can be configured directly from the config common to other rules
[sqlfluff:rules]
Expand Down
3 changes: 3 additions & 0 deletions airflow_variables_dev.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"api_key_path": "/home/airflow/gcs/data/apiKey.json",
"avro_gcs_bucket": "test_dune_bucket_sdf",
"bq_dataset": "test_crypto_stellar_internal",
"bq_dataset_audit_log": "audit_log",
"bq_project": "test-hubble-319619",
Expand Down Expand Up @@ -333,6 +334,7 @@
"task_sla": {
"asset_stats": 720,
"build_batch_stats": 840,
"build_bq_generate_avro_job": 600,
"build_bq_insert_job": 1080,
"build_del_ins_from_gcs_to_bq_task": 2000,
"build_delete_data_task": 1020,
Expand Down Expand Up @@ -366,6 +368,7 @@
},
"task_timeout": {
"build_batch_stats": 180,
"build_bq_generate_avro_job": 600,
"build_bq_insert_job": 180,
"build_copy_table": 180,
"build_dbt_task": 960,
Expand Down
3 changes: 3 additions & 0 deletions airflow_variables_prod.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"api_key_path": "/home/airflow/gcs/data/apiKey.json",
"avro_gcs_bucket": "dune_bucket_sdf",
"bq_dataset": "crypto_stellar_internal_2",
"bq_dataset_audit_log": "audit_log",
"bq_project": "hubble-261722",
Expand Down Expand Up @@ -331,6 +332,7 @@
"task_sla": {
"asset_stats": 420,
"build_batch_stats": 600,
"build_bq_generate_avro_job": 600,
"build_bq_insert_job": 840,
"build_del_ins_from_gcs_to_bq_task": 2000,
"build_delete_data_task": 780,
Expand Down Expand Up @@ -364,6 +366,7 @@
},
"task_timeout": {
"build_batch_stats": 180,
"build_bq_generate_avro_job": 600,
"build_bq_insert_job": 180,
"build_copy_table": 180,
"build_dbt_task": 1800,
Expand Down
74 changes: 74 additions & 0 deletions dags/generate_avro_files_dag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy import DummyOperator
from stellar_etl_airflow import macros
from stellar_etl_airflow.build_bq_generate_avro_job_task import (
build_bq_generate_avro_job,
)
from stellar_etl_airflow.build_cross_dependency_task import build_cross_deps
from stellar_etl_airflow.default import (
alert_sla_miss,
get_default_dag_args,
init_sentry,
)

# Initialize Sentry error reporting before the DAG object is built so that
# parse-time failures are captured as well.
init_sentry()

# Hourly DAG that exports BigQuery tables to AVRO files in GCS.
dag = DAG(
    "generate_avro",
    default_args=get_default_dag_args(),
    start_date=datetime(2024, 10, 1, 1, 0),
    # catchup=True backfills every hourly interval since start_date.
    catchup=True,
    description="This DAG generates AVRO files from BQ tables",
    # Run at the top of every hour.
    schedule_interval="0 * * * *",
    # Render templated fields as native Python objects instead of strings.
    render_template_as_native_obj=True,
    # Macros available to templated task fields for batch-window arithmetic.
    user_defined_macros={
        "subtract_data_interval": macros.subtract_data_interval,
        "batch_run_date_as_datetime_string": macros.batch_run_date_as_datetime_string,
        "batch_run_date_as_directory_string": macros.batch_run_date_as_directory_string,
    },
    sla_miss_callback=alert_sla_miss,
)

# Jinja-templated Airflow Variable lookups; resolved when each task renders,
# not at parse time.
public_project = "{{ var.value.public_project }}"
public_dataset = "{{ var.value.public_dataset }}"
gcs_bucket = "{{ var.value.avro_gcs_bucket }}"


# Wait on ingestion DAGs: exports must not start until the upstream
# history/state table export DAGs have finished for the same interval.
wait_on_history_table = build_cross_deps(
    dag, "wait_on_ledgers_txs", "history_table_export"
)
wait_on_state_table = build_cross_deps(dag, "wait_on_state_table", "state_table_export")

# NOTE(review): DummyOperator is deprecated in newer Airflow versions in
# favor of EmptyOperator, and this task adds no ordering beyond the two
# cross-DAG sensors above — confirm whether it is still needed.
dummy_task = DummyOperator(task_id="dummy_task", dag=dag)
sydneynotthecity marked this conversation as resolved.
Show resolved Hide resolved

# Generate AVRO files: one export task per table, each gated on the
# cross-DAG sensors and the placeholder task defined above.
avro_tables = [
    "accounts",
    "contract_data",
    "history_contract_events",
    "history_ledgers",
    "history_trades",
    "history_transactions",
    "liquidity_pools",
    "offers",
    "trust_lines",
    "ttl",
    # Disabled pending enablement:
    # "history_effects",
    # "history_operations",
]

# Every export task shares the same set of upstream gates.
_upstream_tasks = (dummy_task, wait_on_history_table, wait_on_state_table)

for table_name in avro_tables:
    export_task = build_bq_generate_avro_job(
        dag=dag,
        project=public_project,
        dataset=public_dataset,
        table=table_name,
        gcs_bucket=gcs_bucket,
    )

    for upstream in _upstream_tasks:
        upstream >> export_task
20 changes: 20 additions & 0 deletions dags/queries/generate_avro/accounts.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- Exports accounts rows for the batch window [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (sequence_ledger, batch_id, batch_insert_ts, batch_run_date)
        , sequence_ledger as account_sequence_last_modified_ledger
    from {project_id}.{dataset_id}.accounts
    where
        true
        -- NOTE(review): filtering on both batch_run_date and closed_at —
        -- presumably batch_run_date prunes partitions; confirm the two
        -- columns always land in the same window (late-arriving batches
        -- would be excluded).
        and batch_run_date >= '{batch_run_date}'
        and batch_run_date < '{next_batch_run_date}'
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
17 changes: 17 additions & 0 deletions dags/queries/generate_avro/contract_data.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Exports contract_data rows closed within [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.contract_data
    where
        true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
17 changes: 17 additions & 0 deletions dags/queries/generate_avro/history_contract_events.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Exports history_contract_events rows closed within [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.history_contract_events
    where
        true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
21 changes: 21 additions & 0 deletions dags/queries/generate_avro/history_effects.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Exports history_effects rows for the batch window [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}, flattening the details
-- struct into top-level columns. Placeholders are substituted by the
-- Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop the details struct (re-added flattened below) and internal
        -- ETL bookkeeping columns.
        except (details, batch_id, batch_insert_ts, batch_run_date)
        , details.*
        -- NOTE(review): predicate is excluded from the flattened struct —
        -- presumably it does not export cleanly to AVRO; confirm.
        except (predicate)
    from {project_id}.{dataset_id}.history_effects
    where
        true
        -- batch_run_date presumably prunes partitions; closed_at bounds
        -- the data window — confirm the two always agree.
        and batch_run_date >= '{batch_run_date}'
        and batch_run_date < '{next_batch_run_date}'
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
17 changes: 17 additions & 0 deletions dags/queries/generate_avro/history_ledgers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Exports history_ledgers rows closed within [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.history_ledgers
    where
        true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
22 changes: 22 additions & 0 deletions dags/queries/generate_avro/history_operations.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Exports history_operations rows for the batch window [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}, flattening the details
-- struct into top-level columns. Placeholders are substituted by the
-- Airflow task before execution. (This table is currently commented out of
-- the DAG's table list.)
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop the structs (details re-added flattened below) and internal
        -- ETL bookkeeping columns.
        except (details, details_json, batch_id, batch_insert_ts, batch_run_date)
        , details.*
        -- NOTE(review): claimants and type are excluded from the flattened
        -- struct; type is re-exposed under a distinct name to avoid a column
        -- collision — confirm claimants is intentionally dropped.
        except (claimants, type)
        , details.type as soroban_operation_type
    from {project_id}.{dataset_id}.history_operations
    where
        true
        -- batch_run_date presumably prunes partitions; closed_at bounds
        -- the data window — confirm the two always agree.
        and batch_run_date >= '{batch_run_date}'
        and batch_run_date < '{next_batch_run_date}'
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
18 changes: 18 additions & 0 deletions dags/queries/generate_avro/history_trades.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Exports history_trades rows closed within [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}, renaming ledger_closed_at
-- to closed_at so the export matches the other tables' column naming.
-- Placeholders are substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop ledger_closed_at (re-added under a new name) and internal
        -- ETL bookkeeping columns.
        except (ledger_closed_at, batch_id, batch_insert_ts, batch_run_date)
        , ledger_closed_at as closed_at
    from {project_id}.{dataset_id}.history_trades
    where
        true
        and ledger_closed_at >= '{batch_run_date}'
        and ledger_closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
19 changes: 19 additions & 0 deletions dags/queries/generate_avro/history_transactions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Exports history_transactions rows for the batch window [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.history_transactions
    where
        true
        -- batch_run_date presumably prunes partitions; closed_at bounds
        -- the data window — confirm the two always agree.
        and batch_run_date >= '{batch_run_date}'
        and batch_run_date < '{next_batch_run_date}'
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
19 changes: 19 additions & 0 deletions dags/queries/generate_avro/liquidity_pools.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Exports liquidity_pools rows for the batch window [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.liquidity_pools
    where
        true
        -- batch_run_date presumably prunes partitions; closed_at bounds
        -- the data window — confirm the two always agree.
        and batch_run_date >= '{batch_run_date}'
        and batch_run_date < '{next_batch_run_date}'
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
19 changes: 19 additions & 0 deletions dags/queries/generate_avro/offers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Exports offers rows for the batch window [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.offers
    where
        true
        -- batch_run_date presumably prunes partitions; closed_at bounds
        -- the data window — confirm the two always agree.
        and batch_run_date >= '{batch_run_date}'
        and batch_run_date < '{next_batch_run_date}'
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
19 changes: 19 additions & 0 deletions dags/queries/generate_avro/trust_lines.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Exports trust_lines rows for the batch window [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.trust_lines
    where
        true
        -- batch_run_date presumably prunes partitions; closed_at bounds
        -- the data window — confirm the two always agree.
        and batch_run_date >= '{batch_run_date}'
        and batch_run_date < '{next_batch_run_date}'
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
17 changes: 17 additions & 0 deletions dags/queries/generate_avro/ttl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Exports ttl rows closed within [{batch_run_date},
-- {next_batch_run_date}) as AVRO files to {uri}. Placeholders are
-- substituted by the Airflow task before execution.
export data
options (
    uri = '{uri}'
    , format = 'avro'
    -- Reruns replace any files already present at this uri.
    , overwrite = true
)
as (
    select
        *
        -- Drop internal ETL bookkeeping columns from the public export.
        except (batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.ttl
    where
        true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
Loading
Loading