Integrate cdp changes to ingest from datastore txmeta files #333

Merged · 28 commits · Apr 22, 2024
Changes from 8 commits
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
@@ -59,8 +59,8 @@ jobs:
- id: "get-credentials"
uses: "google-github-actions/get-gke-credentials@v2"
with:
- cluster_name: "us-central1-hubble-1pt5-dev-7db0e004-gke"
- location: "us-central1-c"
+ cluster_name: "us-central1-test-hubble-2-5f1f2dbf-gke"
+ location: "us-central1"

- name: Pytest
run: pytest dags/
@@ -98,14 +98,14 @@ jobs:
run: python dags/stellar_etl_airflow/add_files_to_composer.py --bucket $BUCKET
env:
GOOGLE_CLOUD_PROJECT: test-hubble-319619
- BUCKET: us-central1-hubble-1pt5-dev-7db0e004-bucket
+ BUCKET: us-central1-test-hubble-2-5f1f2dbf-bucket

- name: Update Airflow variables
uses: actions-hub/gcloud@master
env:
PROJECT_ID: test-hubble-319619
APPLICATION_CREDENTIALS: "${{ secrets.CREDS_TEST_HUBBLE }}"
- COMPOSER_ENVIRONMENT: hubble-1pt5-dev
+ COMPOSER_ENVIRONMENT: test-hubble-2
LOCATION: us-central1
with:
args: >
11 changes: 9 additions & 2 deletions airflow_variables_dev.json
@@ -131,7 +131,7 @@
"dbt_threads": 12,
"gcs_exported_data_bucket_name": "us-central1-hubble-1pt5-dev-7db0e004-bucket",
"gcs_exported_object_prefix": "dag-exported",
"image_name": "stellar/stellar-etl:75c9a9c",
"image_name": "chowbao/stellar-etl:cdp-test3",
"image_output_path": "/etl/exported_data/",
"image_pull_policy": "IfNotPresent",
"kube_config_location": "",
@@ -343,5 +343,12 @@
"public_source_schema": "test_crypto_stellar",
"slack_elementary_channel": "stellar-elementary-alerts",
"elementary_secret": "slack-token-elementary",
"dbt_elementary_dataset": "test_elementary"
"dbt_elementary_dataset": "test_elementary",
"use_captive_core": "True",
"txmeta_datastore_url": "gcs://exporter-test/ledgers/testnet",
"cdp_use_captive_core": "False",
"cdp_bq_project": "test-hubble-319619",
"cdp_bq_dataset": "cdp_test_crypto_stellar_internal",
"cdp_public_project": "test-hubble-319619",
"cdp_public_dataset": "cdp_test_crypto_stellar"
}
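
Note on the new entries: `use_captive_core`, `txmeta_datastore_url`, and the `cdp_*` keys are ordinary Airflow variables, so a DAG can read them either at parse time with `Variable.get` or at render time through the `var.value` Jinja template. A minimal sketch of both access patterns, mirroring how the new DAG below consumes them (only the variable names above are assumed):

from ast import literal_eval

from airflow.models.variable import Variable

# Parse-time read: resolved when the scheduler parses the DAG file.
cdp_use_captive_core = literal_eval(Variable.get("cdp_use_captive_core"))  # "False" -> False

# Render-time reads: left as Jinja strings and resolved when a task runs.
internal_project = "{{ var.value.cdp_bq_project }}"
internal_dataset = "{{ var.value.cdp_bq_dataset }}"
txmeta_datastore_url = "{{ var.value.txmeta_datastore_url }}"
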
315 changes: 315 additions & 0 deletions dags/cdp_history_archive_with_captive_core_combined_export_dag.py
@@ -0,0 +1,315 @@
"""
The cdp_history_archive_with_captive_core_combined_export DAG exports the history_* base tables (operations, transactions, trades, and effects).
It is scheduled to export information to BigQuery at regular intervals.
"""
from ast import literal_eval
from datetime import datetime
from json import loads

from airflow import DAG
from airflow.models.variable import Variable
from kubernetes.client import models as k8s
from stellar_etl_airflow import macros
from stellar_etl_airflow.build_batch_stats import build_batch_stats
from stellar_etl_airflow.build_bq_insert_job_task import build_bq_insert_job
from stellar_etl_airflow.build_cross_dependency_task import build_cross_deps
from stellar_etl_airflow.build_delete_data_task import build_delete_data_task
from stellar_etl_airflow.build_export_task import build_export_task
from stellar_etl_airflow.build_gcs_to_bq_task import build_gcs_to_bq_task
from stellar_etl_airflow.build_time_task import build_time_task
from stellar_etl_airflow.default import get_default_dag_args, init_sentry

init_sentry()

dag = DAG(
"cdp_history_archive_with_captive_core_combined_export",
default_args=get_default_dag_args(),
start_date=datetime(2024, 3, 22, 0, 0),
catchup=True,
description="This DAG exports all history_* base tables using CaptiveCore. The DAG is a temporary fix, and not suited for public use.",
schedule_interval="*/30 * * * *",
params={
"alias": "cc",
},
render_template_as_native_obj=True,
user_defined_filters={
"fromjson": lambda s: loads(s),
"container_resources": lambda s: k8s.V1ResourceRequirements(requests=s),
"literal_eval": lambda e: literal_eval(e),
},
user_defined_macros={
"subtract_data_interval": macros.subtract_data_interval,
"batch_run_date_as_datetime_string": macros.batch_run_date_as_datetime_string,
},
)

table_names = Variable.get("table_ids", deserialize_json=True)
internal_project = "{{ var.value.cdp_bq_project }}"
internal_dataset = "{{ var.value.cdp_bq_dataset }}"
public_project = "{{ var.value.cdp_public_project }}"
public_dataset = "{{ var.value.cdp_public_dataset }}"
use_testnet = literal_eval(Variable.get("use_testnet"))
use_futurenet = literal_eval(Variable.get("use_futurenet"))
use_captive_core = literal_eval(Variable.get("cdp_use_captive_core"))
txmeta_datastore_url = "{{ var.value.txmeta_datastore_url }}"

"""
The time task reads in the execution time of the current run, as well as the next
execution time. It converts these two times into ledger ranges.
"""
time_task = build_time_task(dag, use_testnet=use_testnet, use_futurenet=use_futurenet)

"""
The write batch stats task will take a snapshot of the DAG run_id, execution date,
start and end ledgers so that reconciliation and data validation are easier. The
record is written to an internal dataset for data eng use only.
"""
write_op_stats = build_batch_stats(dag, table_names["operations"])
write_trade_stats = build_batch_stats(dag, table_names["trades"])
write_effects_stats = build_batch_stats(dag, table_names["effects"])
write_tx_stats = build_batch_stats(dag, table_names["transactions"])
write_diagnostic_events_stats = build_batch_stats(dag, table_names["diagnostic_events"])

"""
The export tasks call export commands on the Stellar ETL using the ledger range from the time task.
The results of the command are stored in a file. There is one task for each of the data types that
can be exported from the history archives.

The DAG sleeps for 30 seconds after the export_task writes to the file to give the poststart.sh
script time to copy the file over to the correct directory. If there is no sleep, the load task
starts prematurely and will not load data.
"""
all_history_export_task = build_export_task(
dag,
"archive",
"export_all_history",
"all_history",
use_testnet=use_testnet,
use_futurenet=use_futurenet,
use_gcs=True,
resource_cfg="cc",
use_captive_core=use_captive_core,
txmeta_datastore_url=txmeta_datastore_url,
)

"""
The delete partition task checks to see if the given partition/batch id exists in
BigQuery. If it does, the records are deleted prior to reinserting the batch.
"""
delete_old_op_task = build_delete_data_task(
dag, internal_project, internal_dataset, table_names["operations"]
)
delete_old_op_pub_task = build_delete_data_task(
dag, public_project, public_dataset, table_names["operations"], "pub"
)
delete_old_trade_task = build_delete_data_task(
dag, internal_project, internal_dataset, table_names["trades"]
)
delete_old_trade_pub_task = build_delete_data_task(
dag, public_project, public_dataset, table_names["trades"], "pub"
)
delete_enrich_op_task = build_delete_data_task(
dag, internal_project, internal_dataset, "enriched_history_operations"
)
delete_enrich_op_pub_task = build_delete_data_task(
dag, public_project, public_dataset, "enriched_history_operations", "pub"
)
delete_enrich_ma_op_task = build_delete_data_task(
dag, internal_project, internal_dataset, "enriched_meaningful_history_operations"
)
delete_old_effects_task = build_delete_data_task(
dag, internal_project, internal_dataset, table_names["effects"]
)
delete_old_effects_pub_task = build_delete_data_task(
dag, public_project, public_dataset, table_names["effects"], "pub"
)
delete_old_tx_task = build_delete_data_task(
dag, internal_project, internal_dataset, table_names["transactions"]
)
delete_old_tx_pub_task = build_delete_data_task(
dag, public_project, public_dataset, table_names["transactions"], "pub"
)

"""
The send tasks receive the location of the file in Google Cloud Storage through Airflow's XCom system.
Then, the task merges the unique entries in the file into the corresponding table in BigQuery.
"""
send_ops_to_bq_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
internal_project,
internal_dataset,
table_names["operations"],
"exported_operations.txt",
partition=True,
cluster=True,
)
send_trades_to_bq_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
internal_project,
internal_dataset,
table_names["trades"],
"exported_trades.txt",
partition=True,
cluster=True,
)
send_effects_to_bq_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
internal_project,
internal_dataset,
table_names["effects"],
"exported_effects.txt",
partition=True,
cluster=True,
)
send_txs_to_bq_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
internal_project,
internal_dataset,
table_names["transactions"],
"exported_transactions.txt",
partition=True,
cluster=True,
)


"""
Load final public dataset, crypto-stellar
"""
send_ops_to_pub_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
public_project,
public_dataset,
table_names["operations"],
"exported_operations.txt",
partition=True,
cluster=True,
dataset_type="pub",
)
send_trades_to_pub_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
public_project,
public_dataset,
table_names["trades"],
"exported_trades.txt",
partition=True,
cluster=True,
dataset_type="pub",
)
send_effects_to_pub_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
public_project,
public_dataset,
table_names["effects"],
"exported_effects.txt",
partition=True,
cluster=True,
dataset_type="pub",
)
send_txs_to_pub_task = build_gcs_to_bq_task(
dag,
all_history_export_task.task_id,
public_project,
public_dataset,
table_names["transactions"],
"exported_transactions.txt",
partition=True,
cluster=True,
dataset_type="pub",
)

"""
Batch loading of the derived table `enriched_history_operations`, which denormalizes ledgers, transactions, and operations data.
Must wait on the history_archive_without_captive_core DAG to finish before beginning the job.
The internal dataset also creates a filtered table, `enriched_meaningful_history_operations`, which filters down to only relevant asset ops.
"""
wait_on_dag = build_cross_deps(
dag, "wait_on_ledgers_txs", "history_archive_without_captive_core"
)
insert_enriched_hist_task = build_bq_insert_job(
dag,
internal_project,
internal_dataset,
"enriched_history_operations",
partition=True,
cluster=True,
)
insert_enriched_hist_pub_task = build_bq_insert_job(
dag,
public_project,
public_dataset,
"enriched_history_operations",
partition=True,
cluster=True,
dataset_type="pub",
)
insert_enriched_ma_hist_task = build_bq_insert_job(
dag,
internal_project,
internal_dataset,
"enriched_meaningful_history_operations",
partition=True,
cluster=True,
)

(
time_task
>> write_op_stats
>> all_history_export_task
>> delete_old_op_task
>> send_ops_to_bq_task
>> wait_on_dag
>> delete_enrich_op_task
)
(
delete_enrich_op_task
>> insert_enriched_hist_task
>> delete_enrich_ma_op_task
>> insert_enriched_ma_hist_task
)
(
all_history_export_task
>> delete_old_op_pub_task
>> send_ops_to_pub_task
>> wait_on_dag
>> delete_enrich_op_pub_task
>> insert_enriched_hist_pub_task
)
(
time_task
>> write_trade_stats
>> all_history_export_task
>> delete_old_trade_task
>> send_trades_to_bq_task
)
all_history_export_task >> delete_old_trade_pub_task >> send_trades_to_pub_task
(
time_task
>> write_effects_stats
>> all_history_export_task
>> delete_old_effects_task
>> send_effects_to_bq_task
)
all_history_export_task >> delete_old_effects_pub_task >> send_effects_to_pub_task
(
time_task
>> write_tx_stats
>> all_history_export_task
>> delete_old_tx_task
>> send_txs_to_bq_task
>> wait_on_dag
)
all_history_export_task >> delete_old_tx_pub_task >> send_txs_to_pub_task >> wait_on_dag
(time_task >> write_diagnostic_events_stats >> all_history_export_task)
(
[
insert_enriched_hist_pub_task,
insert_enriched_hist_task,
]
)
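
For readers less familiar with Airflow's bitshift syntax, each `>>` chain above is shorthand for `set_downstream` calls, and a list on the right-hand side fans a dependency out to several tasks. A standalone toy sketch of the pattern (EmptyOperator and the task names are illustrative only, not part of this PR; on older Airflow versions the operator is called DummyOperator):

from datetime import datetime

from airflow import DAG
from airflow.operators.empty import EmptyOperator

with DAG("chaining_example", start_date=datetime(2024, 3, 22), schedule_interval=None) as dag:
    export = EmptyOperator(task_id="export")
    delete_old = EmptyOperator(task_id="delete_old")
    load_internal = EmptyOperator(task_id="load_internal")
    load_public = EmptyOperator(task_id="load_public")

    # delete_old runs after export; both loads run only after delete_old succeeds.
    export >> delete_old >> [load_internal, load_public]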