diff --git a/gluestick/etl_utils.py b/gluestick/etl_utils.py index 5d278e8..6764b28 100644 --- a/gluestick/etl_utils.py +++ b/gluestick/etl_utils.py @@ -441,6 +441,15 @@ def parse_objs(x): # if it's not a string, we just return the input if type(x) != str: return x + + # if it's a numeric type we don't want to parse it by accident + if type(x) == str: + try: + float(x) + return x + except: + # it failed so it's not a float or int, we can proceed + pass try: return ast.literal_eval(x) @@ -550,19 +559,24 @@ def __str__(self): def __repr__(self): return str(list(self.input_files.keys())) - def get(self, stream, default=None, catalog_types=False, **kwargs): + def get(self, stream, default=None, catalog_types=False, parse_objects=False, **kwargs): """Read the selected file.""" filepath = self.input_files.get(stream) + df = None if not filepath: return default if filepath.endswith(".parquet"): import pyarrow.parquet as pq - return pq.read_table(filepath).to_pandas(safe=False) - catalog = self.read_catalog() - if catalog and catalog_types: - types_params = self.get_types_from_catalog(catalog, stream) - kwargs.update(types_params) - return pd.read_csv(filepath, **kwargs) + df = pq.read_table(filepath).to_pandas(safe=False) + else: + catalog = self.read_catalog() + if catalog and catalog_types: + types_params = self.get_types_from_catalog(catalog, stream) + kwargs.update(types_params) + df = pd.read_csv(filepath, **kwargs) + if parse_objects and df is not None and not df.empty: + df = parse_object_cols(df) + return df def get_metadata(self, stream): """Get metadata from parquet file.""" @@ -694,6 +708,16 @@ def localize_datetime(df, column_name): return df[column_name] + +def parse_object_cols(df): + object_columns = [column for column in df.columns if df[column].dtype == 'object'] + for col in object_columns: + try: + df[col] = df[col].apply(lambda x: parse_objs(x)) + except: + continue + return df + def exception(exception, root_dir, error_message=None): """ Stores an exception and a message into a file errors.txt, diff --git a/setup.py b/setup.py index d065bb0..7f6c527 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="gluestick", - version="2.1.21", + version="2.1.23", description="ETL utility functions built on Pandas", long_description=long_description, long_description_content_type="text/markdown",