Commit d960fa6
wip
pengyu-hou committed Nov 8, 2023
1 parent c45a286 commit d960fa6
Showing 2 changed files with 25 additions and 2 deletions.
20 changes: 20 additions & 0 deletions spark/src/main/scala/ai/chronon/spark/Driver.scala
@@ -213,6 +213,10 @@ object Driver {
       opt[String](required = false,
                   descr =
                     "Start date to compute join backfill; this start date will override the start partition in conf.")
+    val parallelism: ScallopOption[Int] =
+      opt[Int](required = false,
+               descr = "Number of parallel jobs to run; default is 1",
+               default = Option(1))
     lazy val joinConf: api.Join = parseConf[api.Join](confPath())
     override def subcommandName() = s"join_${joinConf.metaData.name}"
   }
@@ -248,6 +252,14 @@ object Driver {
       opt[Int](required = false,
                descr = "Runs backfill in steps, step-days at a time. Default is 30 days",
                default = Option(30))
+    val startPartitionOverride: ScallopOption[String] =
+      opt[String](required = false,
+                  descr =
+                    "Start date to compute group-by backfill; this start date will override the backfill start date in conf.")
+    val parallelism: ScallopOption[Int] =
+      opt[Int](required = false,
+               descr = "Number of parallel jobs to run; default is 1",
+               default = Option(1))
     lazy val groupByConf: api.GroupBy = parseConf[api.GroupBy](confPath())
     override def subcommandName() = s"groupBy_${groupByConf.metaData.name}_backfill"
   }
@@ -361,6 +373,14 @@ object Driver {
       opt[Boolean](required = false,
                    descr = "Auto-expand the Hive table if new columns are added in the staging query",
                    default = Option(true))
+    val startPartitionOverride: ScallopOption[String] =
+      opt[String](required = false,
+                  descr =
+                    "Start date to compute staging query backfill; this start date will override the start partition in conf.")
+    val parallelism: ScallopOption[Int] =
+      opt[Int](required = false,
+               descr = "Number of parallel jobs to run; default is 1",
+               default = Option(1))
     lazy val stagingQueryConf: api.StagingQuery = parseConf[api.StagingQuery](confPath())
     override def subcommandName() = s"staging_query_${stagingQueryConf.metaData.name}_backfill"
   }
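
Note that this wip commit only defines the new --parallelism option on the three backfill subcommands; nothing consumes the value yet. Below is a minimal sketch, under stated assumptions, of how such a value might bound concurrent step execution; PartitionRange and runStep are hypothetical stand-ins for whatever the Driver will actually call.

import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

object ParallelBackfillSketch {
  // Hypothetical stand-ins for the real Driver pieces.
  case class PartitionRange(start: String, end: String)
  def runStep(range: PartitionRange): Unit =
    println(s"backfilling ${range.start}..${range.end}")

  def run(stepRanges: Seq[PartitionRange], parallelism: Int): Unit = {
    // Bound concurrency at the --parallelism value instead of running steps serially.
    implicit val ec: ExecutionContext =
      ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(parallelism))
    Await.result(Future.sequence(stepRanges.map(r => Future(runStep(r)))), Duration.Inf)
  }
}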
7 changes: 5 additions & 2 deletions spark/src/main/scala/ai/chronon/spark/TableUtils.scala
@@ -2,6 +2,7 @@ package ai.chronon.spark

 import ai.chronon.api.{Constants, PartitionSpec}
 import ai.chronon.api.Extensions._
+import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, Project}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
@@ -226,19 +227,21 @@ case class TableUtils(sparkSession: SparkSession) {
       try {
         sql(creationSql)
       } catch {
+        case _: TableAlreadyExistsException =>
+          println(s"Table $tableName already exists, skipping creation")
         case e: Exception =>
           println(s"Failed to create table $tableName with error: ${e.getMessage}")
           throw e
       }
-    } else {
+    }
     if (tableProperties != null && tableProperties.nonEmpty) {
       sql(alterTablePropertiesSql(tableName, tableProperties))
     }

     if (autoExpand) {
       expandTable(tableName, dfRearranged.schema)
     }
-    }
+

     val finalizedDf = if (autoExpand) {
       // reselect the columns so that any deprecated columns will be selected as NULL before write
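
The new catch arm ties into the parallelism options above: when several step jobs run at once, two can both observe the output table as missing and race on CREATE TABLE; with this change the loser logs and continues instead of failing the whole backfill. A toy reproduction of that race, assuming a local SparkSession (the table name and pool size are illustrative):

import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException

object CreateTableRaceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("create-race").getOrCreate()
    implicit val ec: ExecutionContext =
      ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(2))

    def createIfAbsent(): Unit =
      try spark.sql("CREATE TABLE race_demo (ds STRING) USING parquet")
      catch {
        case _: TableAlreadyExistsException =>
          // The losing attempt lands here and keeps going instead of aborting.
          println("Table race_demo already exists, skipping creation")
      }

    // Two concurrent attempts: one CREATE succeeds, the duplicate is skipped.
    Await.result(Future.sequence(Seq(Future(createIfAbsent()), Future(createIfAbsent()))), Duration.Inf)
    spark.stop()
  }
}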
