hotgluexyz · keyn4 · Feb 16, 2024 · Sep 17, 2024 · Sep 17, 2024 · Sep 17, 2024
diff --git a/gluestick/etl_utils.py b/gluestick/etl_utils.py
@@ -441,6 +441,15 @@ def parse_objs(x):
     # if it's not a string, we just return the input
     if type(x) != str:
         return x
+
+    # if it's a numeric type we don't want to parse it by accident
+    if type(x) == str:
+        try:
+            float(x)
+            return x
+        except:
+            # it failed so it's not a float or int, we can proceed
+            pass
 
     try:
         return ast.literal_eval(x)
@@ -550,19 +559,24 @@ def __str__(self):
     def __repr__(self):
         return str(list(self.input_files.keys()))
 
-    def get(self, stream, default=None, catalog_types=False, **kwargs):
+    def get(self, stream, default=None, catalog_types=False, parse_objects=False, **kwargs):
         """Read the selected file."""
         filepath = self.input_files.get(stream)
+        df = None  # assigned by the parquet or CSV branch below
         if not filepath:
             return default
         if filepath.endswith(".parquet"):
             import pyarrow.parquet as pq
-            return pq.read_table(filepath).to_pandas(safe=False)
-        catalog = self.read_catalog()
-        if catalog and catalog_types:
-            types_params = self.get_types_from_catalog(catalog, stream)
-            kwargs.update(types_params)
-        return pd.read_csv(filepath, **kwargs)
+            df = pq.read_table(filepath).to_pandas(safe=False)
+        else:  # every non-.parquet path is read with pd.read_csv
+            catalog = self.read_catalog()
+            if catalog and catalog_types:  # catalog params are opt-in via catalog_types
+                types_params = self.get_types_from_catalog(catalog, stream)
+                kwargs.update(types_params)
+            df = pd.read_csv(filepath, **kwargs)
+        if parse_objects and df is not None and not df.empty:  # opt-in; empty frames are returned unparsed
+            df = parse_object_cols(df)
+        return df
 
     def get_metadata(self, stream):
         """Get metadata from parquet file."""
@@ -694,6 +708,29 @@ def localize_datetime(df, column_name):
 
     return df[column_name]
 
+
+def parse_object_cols(df):
+    """Run parse_objs over every object-dtype column of a DataFrame.
+
+    Columns are rewritten in place on ``df``; a column whose parsing raises
+    is left untouched so one bad column does not block the others.
+
+    Args:
+        df: DataFrame whose ``object`` columns should be parsed.
+
+    Returns:
+        The same ``df`` object, with parsed columns assigned back onto it.
+    """
+    object_columns = [column for column in df.columns if df[column].dtype == 'object']
+    for col in object_columns:
+        try:
+            df[col] = df[col].apply(parse_objs)
+        except Exception:
+            # Best-effort: skip this column, but let KeyboardInterrupt and
+            # SystemExit propagate instead of swallowing them.
+            continue
+    return df
+
 def exception(exception, root_dir, error_message=None):
     """
     Stores an exception and a message into a file errors.txt, 

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="gluestick",
-    version="2.1.21",
+    version="2.1.23",
     description="ETL utility functions built on Pandas",
     long_description=long_description,
     long_description_content_type="text/markdown",