diff --git a/integration_tests/models/plugins/spark/spark_external.yml b/integration_tests/models/plugins/spark/spark_external.yml index bd2af2d..82a2af8 100644 --- a/integration_tests/models/plugins/spark/spark_external.yml +++ b/integration_tests/models/plugins/spark/spark_external.yml @@ -41,6 +41,12 @@ sources: columns: *cols-of-the-people tests: *equal-to-the-people + - name: people_csv_partitioned_inferred_using + external: + <<: *csv-people-using + recover_partitions: true + tests: *equal-to-the-people + # ----- TODO: hive format # - name: people_csv_unpartitioned_hive_format diff --git a/macros/plugins/spark/helpers/recover_partitions.sql b/macros/plugins/spark/helpers/recover_partitions.sql index bdc4b22..fee174f 100644 --- a/macros/plugins/spark/helpers/recover_partitions.sql +++ b/macros/plugins/spark/helpers/recover_partitions.sql @@ -1,7 +1,8 @@ {% macro spark__recover_partitions(source_node) %} {# https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-alter-table.html #} - {%- if source_node.external.partitions and source_node.external.using and source_node.external.using|lower != 'delta' -%} + {%- if (source_node.external.partitions or source_node.external.recover_partitions) + and source_node.external.using and source_node.external.using|lower != 'delta' -%} {% set ddl %} ALTER TABLE {{ source(source_node.source_name, source_node.name) }} RECOVER PARTITIONS {% endset %} diff --git a/sample_sources/spark.yml b/sample_sources/spark.yml index 6106cb3..a7c9b95 100644 --- a/sample_sources/spark.yml +++ b/sample_sources/spark.yml @@ -37,3 +37,15 @@ sources: - name: contexts data_type: string description: "Contexts attached to event by Tracker" + +- name: event_inferred_schema + description: "Snowplow events stored as partitioned parquet files in HDFS with inferred schema" + external: + # File path can contain partitions such as: hdfs://.../events/my_partition=2022-03-01/events1.parquet + # These partitions are excluded from 'location'. + location: 'hdfs://.../events/' + using: parquet + + # Setting recover_partitions to true causes partitions to be refreshed, + # even though partitions are not explicitly specified. + recover_partitions: true