From e53f511a9160f60e203a28188cfe5b1fd75e63d9 Mon Sep 17 00:00:00 2001 From: Raymond Wiker Date: Thu, 4 Jul 2024 12:33:20 +0200 Subject: [PATCH] Make Table.to_arrow() and Table.to_panda() more robust when there is a mismatch between data.format and blob content. (#339) Co-authored-by: Raymond Wiker --- src/fmu/sumo/explorer/objects/table.py | 176 +++++++++++-------------- 1 file changed, 80 insertions(+), 96 deletions(-) diff --git a/src/fmu/sumo/explorer/objects/table.py b/src/fmu/sumo/explorer/objects/table.py index 03f61132..6a174ce2 100644 --- a/src/fmu/sumo/explorer/objects/table.py +++ b/src/fmu/sumo/explorer/objects/table.py @@ -22,7 +22,49 @@ def __init__(self, sumo: SumoClient, metadata: dict) -> None: self._dataframe = None self._arrowtable = None self._logger = logging.getLogger("__name__" + ".Table") - + self._blob = None + + def _get_blob(self): + if self._blob is None: + self._blob = self.blob + return self._blob + + async def _get_blob_async(self): + if self._blob is None: + self._blob = await self.blob_async + return self._blob + + def _read_table(self): + return self._construct_table_from_blob(self._get_blob()) + + async def _read_table_async(self): + return self._construct_table_from_blob(await self._get_blob_async()) + + def _construct_table_from_blob(self, blob): + try: + if self.dataformat == "csv": + dataframe = pd.read_csv(blob) + elif self.dataformat == "parquet": + dataframe = pd.read_parquet(blob) + elif self.dataformat == "arrow": + dataframe = pf.read_feather(blob) + else: + raise TypeError(f"Don't know how to convert a blob of format {self.dataformat} to a pandas table.") + except Exception as ex0: + try: + dataframe = pd.read_csv(blob) + except Exception as ex: + try: + dataframe = pd.read_parquet(blob) + except Exception as ex: + try: + dataframe = pf.read_feather(blob) + except Exception as ex: + raise TypeError(f"Unable to convert a blob of format {self.dataformat} to pandas table; tried csv, parquet and feather.") + pass + pass + 
pass + return dataframe def to_pandas(self) -> pd.DataFrame: """Return object as a pandas DataFrame @@ -30,32 +72,8 @@ def to_pandas(self) -> pd.DataFrame: Returns: DataFrame: A DataFrame object """ - if self._dataframe is None: - if self["data"]["format"] == "csv": - worked = "csv" - self._logger.debug("Treating blob as csv") - try: - self._dataframe = pd.read_csv(self.blob) - worked = "csv" - - except UnicodeDecodeError as ud_e: - raise UnicodeDecodeError("Maybe not csv?") from ud_e - else: - try: - worked = "feather" - self._dataframe = pf.read_feather(self.blob) - except pa.lib.ArrowInvalid: - try: - worked = "parquet" - self._dataframe = pd.read_parquet(self.blob) - - except UnicodeDecodeError as ud_error: - raise TypeError( - "Come on, no way this is converting to pandas!!" - ) from ud_error - - self._logger.debug("Read blob as %s to return pandas", worked) + self._dataframe = self._read_table() return self._dataframe async def to_pandas_async(self) -> pd.DataFrame: @@ -64,34 +82,45 @@ async def to_pandas_async(self) -> pd.DataFrame: Returns: DataFrame: A DataFrame object """ - if self._dataframe is None: - if self["data"]["format"] == "csv": - worked = "csv" - self._logger.debug("Treating blob as csv") - try: - self._dataframe = pd.read_csv(await self.blob_async) - worked = "csv" + self._dataframe = await self._read_table_async() + return self._dataframe - except UnicodeDecodeError as ud_e: - raise UnicodeDecodeError("Maybe not csv?") from ud_e + def _read_arrow(self): + return self._construct_arrow_from_blob(self._get_blob()) + + async def _read_arrow_async(self): + return self._construct_arrow_from_blob(await self._get_blob_async()) + + def _construct_arrow_from_blob(self, blob): + try: + if self.dataformat == "csv": + arrowtable = pa.Table.from_pandas( + pd.read_csv(blob) + ) + elif self.dataformat == "parquet": + arrowtable = pq.read_table(blob) + elif self.dataformat == "arrow": + arrowtable = pf.read_table(blob) else: + raise TypeError(f"Don't know how 
to convert a blob of format {self.dataformat} to a pandas table.") + except Exception as ex0: + try: + arrowtable = pa.Table.from_pandas( + pd.read_csv(blob) + ) + except Exception as ex: try: - worked = "feather" - self._dataframe = pf.read_feather(await self.blob_async) - except pa.lib.ArrowInvalid: + arrowtable = pq.read_table(blob) + except Exception as ex: try: - worked = "parquet" - self._dataframe = pd.read_parquet(await self.blob_async) - - except UnicodeDecodeError as ud_error: - raise TypeError( - "Come on, no way this is converting to pandas!!" - ) from ud_error - - self._logger.debug("Read blob as %s to return pandas", worked) - return self._dataframe - + arrowtable = pf.read_table(blob) + except Exception as ex: + raise TypeError(f"Unable to convert a blob of format {self.dataformat} to arrow; tried csv, parquet and feather.") + pass + pass + pass + return arrowtable def to_arrow(self) -> pa.Table: """Return object as an arrow Table @@ -100,31 +129,7 @@ def to_arrow(self) -> pa.Table: pa.Table: _description_ """ if self._arrowtable is None: - if self["data"]["format"] == "parquet": - worked = "parquet" - self._arrowtable = pq.read_table(self.blob) - elif self["data"]["format"] == "arrow": - try: - worked = "feather" - self._arrowtable = pf.read_table(self.blob) - except pa.lib.ArrowInvalid: - worked = "parquet" - self._arrowtable = pq.read_table(self.blob) - else: - warn( - "Reading csv format into arrow, you will not get the full benefit of native arrow" - ) - worked = "csv" - try: - self._arrowtable = pa.Table.from_pandas( - pd.read_csv(self.blob) - ) - - except TypeError as type_err: - raise OSError("Cannot read this into arrow") from type_err - - self._logger.debug("Read blob as %s to return arrow", worked) - + self._arrowtable = self._read_arrow() return self._arrowtable async def to_arrow_async(self) -> pa.Table: @@ -134,26 +139,5 @@ async def to_arrow_async(self) -> pa.Table: pa.Table: _description_ """ if self._arrowtable is None: - if 
self["data"]["format"] == "arrow": - try: - worked = "feather" - self._arrowtable = pf.read_table(await self.blob_async) - except pa.lib.ArrowInvalid: - worked = "parquet" - self._arrowtable = pq.read_table(await self.blob_async) - else: - warn( - "Reading csv format into arrow, you will not get the full benefit of native arrow" - ) - worked = "csv" - try: - self._arrowtable = pa.Table.from_pandas( - pd.read_csv(await self.blob_async) - ) - - except TypeError as type_err: - raise OSError("Cannot read this into arrow") from type_err - - self._logger.debug("Read blob as %s to return arrow", worked) - + self._arrowtable = await self._read_arrow_async() return self._arrowtable