Comparing changes

base repository: wfau/gaiadmpsetup
base: v0.1.1
head repository: wfau/gaiadmpsetup
compare: main

Commits on Jun 29, 2022

  1. .DS_Store banished!
     NigelHambly committed Jun 29, 2022 · 1289725
  2. d6d1ed1
  3. Fixed schema over-write bug; modified DR3 schema structures for consistency and alignment with DR3 bulk release products
     NigelHambly committed Jun 29, 2022 · 86868be

Commits on Jul 5, 2022

  1. 5cd6f90

Commits on Jul 7, 2022

  1. f22edb0
  2. Bug fixes in setup; DR3 demo dataset finalised: commented out epoch_photometry for now as the boolean arrays were incorrectly parsed from csv
     NigelHambly committed Jul 7, 2022 · a57f03a
  3. Merge pull request #4 from NigelHambly/main
     DR3 demo data set finalised
     stvoutsin authored Jul 7, 2022 · 8827426
  4. f039709
  5. Merge pull request #5 from NigelHambly/main
     Fix setup to switch DB context before checking table sets
     Zarquan authored Jul 7, 2022 · bf2f092

Commits on Aug 3, 2022

  1. fd039c6

Commits on Aug 9, 2022

  1. Merge pull request #7 from NigelHambly/main
     Fixed ALLWISE incorrect schema bug and included all XMs in DR3
     Zarquan authored Aug 9, 2022 · c6ec82a

Commits on Sep 2, 2022

  1. 647908b

Commits on Sep 7, 2022

  1. Merge pull request #8 from NigelHambly/main
     Fixed bug for parsing double-quoted boolean labels in CSVs
     stvoutsin authored Sep 7, 2022 · feacc54

Commits on Sep 21, 2022

  1. 82aa06b
  2. Merge pull request #10 from stvoutsin/bugfix/imports
     Fix import of gaiaedr3_pyspark_schema_structures
     stvoutsin authored Sep 21, 2022 · ec0310d

Commits on Sep 28, 2022

  1. 94ca8bd

Commits on Feb 7, 2023

  1. data store as env var
     akrause2014 committed Feb 7, 2023 · 2eb86e7

Commits on Feb 13, 2023

  1. Merge pull request #12 from NigelHambly/main
     Completed DR3 set up with all available tables
     Zarquan authored Feb 13, 2023 · e4ed86d

Commits on Mar 22, 2023

  1. 42f9549
  2. 5181708
  3. new version
     akrause2014 committed Mar 22, 2023 · c620c57
  4. fixed schema parameter
     akrause2014 committed Mar 22, 2023 · 103e2f2
  5. add missing slash
     akrause2014 committed Mar 22, 2023 · 10f50aa
  6. removed unused import
     akrause2014 committed Mar 22, 2023 · d054c1c

Commits on Mar 29, 2023

  1. Merge pull request #13 from akrause2014/main
     Configurable data location
     stvoutsin authored Mar 29, 2023 · 0a42e65
  2. Update version to 0.1.5
     stvoutsin authored Mar 29, 2023 · 8e12864
  3. Merge pull request #14 from stvoutsin/bugfix/update-version
     Update version to 0.1.5
     stvoutsin authored Mar 29, 2023 · 0236a28

Showing with 557 additions and 224 deletions.
  1. +1 −0 .gitignore
  2. +1 −0 gaiadmpconf/conf.py
  3. +63 −26 gaiadmpsetup/gaiadmpsetup.py
  4. +26 −14 gaiadmpsetup/gaiadmpstore.py
  5. +460 −178 gaiadmpsetup/gaiadr3_pyspark_schema_structures.py
  6. +4 −4 gaiadmpsetup/gaiaedr3_pyspark_schema_structures.py
  7. +2 −2 setup.py
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.DS_Store
1 change: 1 addition & 0 deletions gaiadmpconf/conf.py
@@ -0,0 +1 @@
GAIA_DATA_LOCATION = 'file:///data/gaia/'
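
The single line above is the packaged default added by the "data store as env var" and "Configurable data location" work; gaiadmpsetup.py reads it as conf.GAIA_DATA_LOCATION (see the next file). A minimal sketch of how a deployment could layer an environment override on top of that default follows; the GAIA_DATA_LOCATION environment variable name and the os.environ fallback are assumptions for illustration and are not shown in this diff:

    import os

    from gaiadmpconf import conf

    def resolve_data_location():
        # Hypothetical helper: prefer an environment override, fall back to the
        # packaged default in gaiadmpconf/conf.py. The variable name is assumed.
        return os.environ.get('GAIA_DATA_LOCATION', conf.GAIA_DATA_LOCATION)

    data_store = resolve_data_location()  # e.g. 'file:///data/gaia/' when no override is set
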
89 changes: 63 additions & 26 deletions gaiadmpsetup/gaiadmpsetup.py
@@ -1,9 +1,29 @@
from pyspark.sql.types import *
from pyspark.sql.session import SparkSession
from pyspark.sql.utils import AnalysisException

from . import gaiaedr3_pyspark_schema_structures as edr3
from . import gaiadr3_pyspark_schema_structures as dr3
from .gaiadmpstore import *
from gaiadmpconf import conf

from urllib.parse import urlsplit, unquote_plus
from pathlib import Path

spark = SparkSession.builder.getOrCreate()

class Location(object):

def __init__(self, url):
(scheme, net_loc, path, _, _) = urlsplit(url)
path = Path(unquote_plus(path))
self.parts = (scheme, net_loc, path)

def __eq__(self, other):
return self.parts == other.parts

def __hash__(self):
return hash(self.parts)

spark = SparkSession.builder.getOrCreate()

@@ -12,43 +32,60 @@ class GaiaDMPSetup:
Prepare the PySpark env for GaiaDMP
"""

databases = {
'gaiaedr3': edr3,
'gaiadr3': dr3,
}

def __init__(self):
pass

@staticmethod
def setup():

def tablesExist():
actual_tables = [i.name for i in spark.catalog.listTables()]
expected_tables = edr3.table_dict.keys() # | dr3.table_dict.keys() TODO add in DR3 tables expected once loaded
check = all(item in actual_tables for item in expected_tables)
return check

if not tablesExist():
# database name to create
database = "gaiaedr3"
data_store = conf.GAIA_DATA_LOCATION

# create the database and switch the current SQL database context to it (from default)
spark.sql("create database " + database)
spark.sql("use " + database)
def tablesExist(expected_tables, database):

check = False

try:

spark.sql("use " + database)
actual_tables = [i.name for i in spark.catalog.listTables()]
check = all(item in actual_tables for item in expected_tables)

except AnalysisException: pass

return check

# create the tables against their corresponding file sets and schema
for table_key in edr3.table_dict.keys():
folder_path = edr3.table_dict[table_key][1]
schemas = edr3.table_dict[table_key][0]
reattachParquetFileResourceToSparkContext(table_key, data_store + folder_path, schemas)
def location_changed(expected_location, schema):
check = False
for table_key in schema.table_dict.keys():
location = spark.sql(f"desc formatted {table_key}").filter("col_name=='Location'").collect()[0].data_type
folder_path = schema.table_dict[table_key][1]
if Location(location) != Location(expected_location + folder_path):
check = True
# only compare the first table, assuming they are all the same
break
return check

# ... similarly for Gaia DR3
database = "gaiadr3"
spark.sql("create database " + database)
spark.sql("use " + database)
for database, schema in GaiaDMPSetup.databases.items():
if not tablesExist(schema.table_dict.keys(), database) or location_changed(data_store, schema):

# TODO create the tables against their corresponding file sets and schema
#for table_key in dr3.table_dict.keys():
# folder_path = dr3.table_dict[table_key][0]
# schema = dr3.table_dict[table_key][1]
# reattachParquetFileResourceToSparkContext(table_key, data_store + folder_path, *schema)
# create the database and switch the current SQL database context to it (from default)
spark.sql("create database if not exists " + database)
spark.sql("use " + database)

# create the tables against their corresponding file sets and schema
for table_key in schema.table_dict.keys():
folder_path = schema.table_dict[table_key][1]
schemas = schema.table_dict[table_key][0]
pk = schema.table_dict[table_key][2]
reattachParquetFileResourceToSparkContext(table_key, data_store + folder_path, schemas, cluster_key = pk, sort_key = pk)

# finally always leave the PySpark SQL context in the most recent Gaia DR3 database
spark.sql("use gaiadr3")

GaiaDMPSetup.setup()
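
The Location helper introduced above normalises a storage URL into (scheme, network location, path) so that location_changed() does not flag a rebuild for URLs that differ only in percent-encoding or a trailing slash. A small illustration of the comparison it enables, reusing the class exactly as defined in the diff (the example URLs are made up):

    from urllib.parse import urlsplit, unquote_plus
    from pathlib import Path

    class Location(object):
        # Copied from gaiadmpsetup.py above: split the URL and normalise the path.
        def __init__(self, url):
            (scheme, net_loc, path, _, _) = urlsplit(url)
            path = Path(unquote_plus(path))
            self.parts = (scheme, net_loc, path)
        def __eq__(self, other):
            return self.parts == other.parts
        def __hash__(self):
            return hash(self.parts)

    print(Location('file:///data/gaia/GEDR3') == Location('file:///data/gaia/GEDR3/'))  # True: Path drops the trailing slash
    print(Location('file:///data/gaia/') == Location('hdfs://host/data/gaia/'))         # False: scheme and net_loc differ
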

40 changes: 26 additions & 14 deletions gaiadmpsetup/gaiadmpstore.py
@@ -8,9 +8,6 @@

spark = SparkSession.builder.getOrCreate()

# root data store path: TODO change this to the official one when established.
data_store = "file:////data/gaia/" # "file:////user/nch/PARQUET/REPARTITIONED/"

# default key by which to bucket and sort: Gaia catalogue source UID
default_key = "source_id"

@@ -43,7 +40,7 @@ def saveToBinnedParquet(df, outputParquetPath, name, mode = "error", buckets = N
.saveAsTable(name)

def reattachParquetFileResourceToSparkContext(table_name, file_path, schema_structures, cluster_key = default_key, sort_key = default_key, buckets = NUM_BUCKETS):
"""
'''
Creates a Spark (in-memory) meta-record for the table resource specified for querying
through the PySpark SQL API.
@@ -69,7 +66,7 @@ def reattachParquetFileResourceToSparkContext(table_name, file_path, schema_stru
Default is Gaia catalogue source UID (= source_id).
buckets : int (optional)
Number of buckets into which the data is organised.
"""
'''

# put in the columns and their data types ...
table_create_statement = "CREATE TABLE `" + table_name + "` ("
@@ -93,6 +90,8 @@ def reattachParquetFileResourceToSparkContext(table_name, file_path, schema_stru
# create the table resource
spark.sql(table_create_statement)

import copy

def create_interim_schema_for_csv(schema_structure):
'''
Takes a schema StructType() and substitutes all array types as a string in order
@@ -115,13 +114,14 @@ def create_interim_schema_for_csv(schema_structure):

# iterate over the schema, copying in everything and substituting strings for any arrays
for field in schema_structure:
if type(field.dataType) == ArrayType: field.dataType = StringType()
interim_structure.add(field)
interim_field = copy.deepcopy(field)
if type(interim_field.dataType) == ArrayType: interim_field.dataType = StringType()
interim_structure.add(interim_field)

return interim_structure

def cast_to_array(data_frame : DataFrame, column_name : str, data_type : DataType):
"""
'''
Casts the specified string column in the given data frame into an
array with the specified data type. Assumes the string column contains
comma-separated values in plain text delimited by braces (which are
@@ -143,13 +143,17 @@ def cast_to_array(data_frame : DataFrame, column_name : str, data_type : DataTyp
Returns:
--------
a new data frame containing the requested modification
"""
'''

# a temporary working column name for the array
temporary_column_name = column_name + '_array_data'

# reformat the string csv data as an array of the specified type
data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(2), f.length(f.col(column_name)) - 2), ',').cast(data_type))
if isinstance(data_type.elementType, BooleanType):
# ... need to allow for the double quoted boolean labels
data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(3), f.length(f.col(column_name)) - 3), '","').cast(data_type))
else:
data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(2), f.length(f.col(column_name)) - 2), ',').cast(data_type))

# drop the original string column to save space
data_frame = data_frame.drop(column_name)
@@ -161,7 +165,7 @@ def cast_to_array(data_frame : DataFrame, column_name : str, data_type : DataTyp


def reorder_columns(data_frame : DataFrame, data_structure : StructType):
"""
'''
Reorder the columns according to the Gaia archive public schema and so that
the parquet files can be re-attached against that standard schema.
@@ -171,7 +175,11 @@ def reorder_columns(data_frame : DataFrame, data_structure : StructType):
The PySpark data frame instance to be operated on
data_structure : StructType()
The PySpark data structure containing the required schema definition
"""
Returns:
--------
a new data frame with columns re-ordered according to that in the schema
'''

# use the schema to define the column order
ordered_columns = [field.name for field in data_structure]
@@ -181,7 +189,7 @@ def reorder_columns(data_frame : DataFrame, data_structure : StructType):


def cast_all_arrays(data_frame : DataFrame, data_structure : StructType):
"""
'''
Given an interim data frame read from csv and containing arrays in
plain text string representation, cycles over the schema transforming
all strings associated with arrays into the required primitive type.
@@ -192,7 +200,11 @@ def cast_all_arrays(data_frame : DataFrame, data_structure : StructType):
The PySpark data frame instance to be operated on
data_structure : StructType()
The PySpark data structure containing the required schema definition
"""
Returns:
--------
a new data frame with all arrays expressed as interim strings cast to their array structure type
'''

# cycle over the defined fields looking for arrays
for field in data_structure:
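
The change to cast_to_array above implements the "double-quoted boolean labels" fix from the commit history: boolean arrays arrive from the CSVs as strings such as {"false","true"}, so the boolean branch uses different substring offsets and a '","' separator before casting. A simplified, self-contained sketch of the same idea, using regexp_replace to strip the braces and quotes instead of the substring arithmetic in the diff (the sample value and column name are illustrative only):

    from pyspark.sql import SparkSession, functions as f
    from pyspark.sql.types import ArrayType, BooleanType

    spark = SparkSession.builder.getOrCreate()

    # One row with a quoted boolean array as it might be read from CSV (column name is made up).
    df = spark.createDataFrame([('{"false","true"}',)], ['variability_flag'])

    # Strip '{', '}' and '"', split on commas, then cast the string array to array<boolean>.
    df = df.withColumn(
        'variability_flag_array',
        f.split(f.regexp_replace('variability_flag', '[{}"]', ''), ',').cast(ArrayType(BooleanType()))
    )

    df.show(truncate = False)  # [false, true]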