From fbc766f94dd0ed52fbfba0e526c7de8d8de57e40 Mon Sep 17 00:00:00 2001 From: Jarno Rajala Date: Tue, 1 Mar 2022 09:00:05 +0200 Subject: [PATCH] Recover partitions for Spark Add 'recover_partitions' option for Spark to run ALTER TABLE RECOVER PARTITIONS even if partitions are not explicitly specified. This makes using inferred partitions possible. --- .../models/plugins/spark/spark_external.yml | 6 ++++++ macros/plugins/spark/helpers/recover_partitions.sql | 2 +- sample_sources/spark.yml | 12 ++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/integration_tests/models/plugins/spark/spark_external.yml b/integration_tests/models/plugins/spark/spark_external.yml index bd2af2df..82a2af80 100644 --- a/integration_tests/models/plugins/spark/spark_external.yml +++ b/integration_tests/models/plugins/spark/spark_external.yml @@ -41,6 +41,12 @@ sources: columns: *cols-of-the-people tests: *equal-to-the-people + - name: people_csv_partitioned_inferred_using + external: + <<: *csv-people-using + recover_partitions: true + tests: *equal-to-the-people + # ----- TODO: hive format # - name: people_csv_unpartitioned_hive_format diff --git a/macros/plugins/spark/helpers/recover_partitions.sql b/macros/plugins/spark/helpers/recover_partitions.sql index 2d20212b..a79de3e2 100644 --- a/macros/plugins/spark/helpers/recover_partitions.sql +++ b/macros/plugins/spark/helpers/recover_partitions.sql @@ -2,7 +2,7 @@ {# https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-alter-table.html #} {% set ddl %} - {%- if source_node.external.partitions and source_node.external.using and source_node.external.using|lower != 'delta' -%} + {%- if (source_node.external.partitions or source_node.external.recover_partitions) and source_node.external.using and source_node.external.using|lower != 'delta' -%} ALTER TABLE {{ source(source_node.source_name, source_node.name) }} RECOVER PARTITIONS {%- endif -%} {% endset %} diff --git a/sample_sources/spark.yml b/sample_sources/spark.yml index 658e198c..5e49b6fa 100644 --- a/sample_sources/spark.yml +++ b/sample_sources/spark.yml @@ -30,3 +30,15 @@ sources: - name: contexts data_type: string description: "Contexts attached to event by Tracker" + +- name: event_inferred_schema + description: "Snowplow events stored as partitioned parquet files in HDFS with inferred schema" + external: + # File path can contain partitions such as: hdfs://.../events/my_partition=2022-03-01/events1.parquet + # These partitions are excluded from 'location'. + location: 'hdfs://.../events/' + using: parquet + + # Setting recover_partitions to true causes partitions to be refreshed, + # even though partitions are not explicitly specified. + recover_partitions: true