Skip to content

Commit

Permalink
Make Table.to_arrow() and Table.to_pandas() more robust when there is …
Browse files Browse the repository at this point in the history
…a mismatch between data.format and blob content. (#339)

Co-authored-by: Raymond Wiker <[email protected]>
  • Loading branch information
rwiker and rwiker authored Jul 4, 2024
1 parent 68b8ee0 commit e53f511
Showing 1 changed file with 80 additions and 96 deletions.
176 changes: 80 additions & 96 deletions src/fmu/sumo/explorer/objects/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,40 +22,58 @@ def __init__(self, sumo: SumoClient, metadata: dict) -> None:
self._dataframe = None
self._arrowtable = None
self._logger = logging.getLogger("__name__" + ".Table")

self._blob = None

def _get_blob(self):
    """Return the object's blob, reading ``self.blob`` only on first use.

    The value is remembered in ``self._blob`` so that later calls return the
    same object without touching ``self.blob`` again.
    """
    cached = self._blob
    if cached is not None:
        return cached
    cached = self.blob
    self._blob = cached
    return cached

async def _get_blob_async(self):
    """Return the object's blob, awaiting ``self.blob_async`` only on first use.

    The awaited value is remembered in ``self._blob`` so that later calls
    return it without awaiting ``self.blob_async`` again.
    """
    cached = self._blob
    if cached is not None:
        return cached
    cached = await self.blob_async
    self._blob = cached
    return cached

def _read_table(self):
    """Fetch the (cached) blob and convert it to a pandas DataFrame."""
    build = self._construct_table_from_blob
    return build(self._get_blob())

async def _read_table_async(self):
    """Await the (cached) blob and convert it to a pandas DataFrame."""
    build = self._construct_table_from_blob
    return build(await self._get_blob_async())

def _construct_table_from_blob(self, blob):
    """Convert a blob to a pandas DataFrame.

    The reader matching ``self.dataformat`` ("csv", "parquet" or "arrow") is
    tried first. If it fails, or the declared format is not recognised, the
    remaining readers are tried in the order csv, parquet, feather, so that a
    mismatch between the declared format and the actual content is tolerated.

    Args:
        blob: The blob to parse, typically a seekable binary stream.

    Returns:
        The DataFrame produced by the first reader that succeeds.

    Raises:
        TypeError: If none of the readers can parse the blob. The exception
            raised by the last reader is attached as ``__cause__``.
    """
    # Lambdas keep the module lookups lazy; only the readers actually tried
    # are resolved.
    readers = {
        "csv": lambda data: pd.read_csv(data),
        "parquet": lambda data: pd.read_parquet(data),
        "arrow": lambda data: pf.read_feather(data),
    }

    # Declared format first, then every other reader in csv/parquet/feather
    # order. Re-trying the declared reader would be pointless because each
    # attempt already starts from the beginning of the stream.
    attempts = [self.dataformat] if self.dataformat in readers else []
    attempts.extend(fmt for fmt in readers if fmt not in attempts)

    last_error = None
    for fmt in attempts:
        try:
            # The blob is cached and shared between calls, and a failed reader
            # may leave the stream at an arbitrary offset, so always rewind.
            if hasattr(blob, "seek"):
                blob.seek(0)
            return readers[fmt](blob)
        except Exception as ex:
            # Deliberately broad: the readers raise unrelated exception types
            # (parser errors, decode errors, ArrowInvalid, ImportError, ...).
            last_error = ex

    raise TypeError(
        f"Unable to convert a blob of format {self.dataformat} to pandas table;"
        " tried csv, parquet and feather."
    ) from last_error

def to_pandas(self) -> pd.DataFrame:
"""Return object as a pandas DataFrame
Returns:
DataFrame: A DataFrame object
"""
# NOTE(review): indentation is missing in this listing, and the span appears
# to interleave the pre-change body (the format-specific readers below) with
# the post-change body (the self._read_table() call near the end) -- confirm
# against the real diff before editing.

# The result is cached in self._dataframe; the blob is only parsed while that
# attribute is still None.
if self._dataframe is None:
if self["data"]["format"] == "csv":
worked = "csv"
self._logger.debug("Treating blob as csv")
try:
self._dataframe = pd.read_csv(self.blob)
worked = "csv"

except UnicodeDecodeError as ud_e:
# NOTE(review): UnicodeDecodeError takes five constructor arguments, so this
# single-argument call would itself raise TypeError.
raise UnicodeDecodeError("Maybe not csv?") from ud_e
else:
try:
worked = "feather"
self._dataframe = pf.read_feather(self.blob)
except pa.lib.ArrowInvalid:
try:
worked = "parquet"
self._dataframe = pd.read_parquet(self.blob)

except UnicodeDecodeError as ud_error:
raise TypeError(
"Come on, no way this is converting to pandas!!"
) from ud_error

self._logger.debug("Read blob as %s to return pandas", worked)
# Delegates blob retrieval and format handling to _read_table().
self._dataframe = self._read_table()
return self._dataframe

async def to_pandas_async(self) -> pd.DataFrame:
Expand All @@ -64,34 +82,45 @@ async def to_pandas_async(self) -> pd.DataFrame:
Returns:
DataFrame: A DataFrame object
"""
# NOTE(review): indentation is missing in this listing, the docstring opening
# is hidden behind the collapsed-hunk marker above, and the span appears to
# interleave the pre-change csv branch with the post-change
# self._read_table_async() call -- confirm against the real diff.

# The result is cached in self._dataframe; the blob is only parsed while that
# attribute is still None.
if self._dataframe is None:
if self["data"]["format"] == "csv":
worked = "csv"
self._logger.debug("Treating blob as csv")
try:
self._dataframe = pd.read_csv(await self.blob_async)
worked = "csv"
# Delegates blob retrieval and format handling to _read_table_async().
self._dataframe = await self._read_table_async()
return self._dataframe

# NOTE(review): the handler below follows the return statement in this
# listing; presumably it is a removed line belonging to the old csv branch.
except UnicodeDecodeError as ud_e:
# NOTE(review): UnicodeDecodeError takes five constructor arguments, so this
# single-argument call would itself raise TypeError.
raise UnicodeDecodeError("Maybe not csv?") from ud_e
def _read_arrow(self):
    """Fetch the (cached) blob and convert it to an arrow Table."""
    build = self._construct_arrow_from_blob
    return build(self._get_blob())

async def _read_arrow_async(self):
    """Await the (cached) blob and convert it to an arrow Table."""
    build = self._construct_arrow_from_blob
    return build(await self._get_blob_async())

def _construct_arrow_from_blob(self, blob):
    """Convert a blob to an arrow Table.

    The reader matching ``self.dataformat`` ("csv", "parquet" or "arrow") is
    tried first. If it fails, or the declared format is not recognised, the
    remaining readers are tried in the order csv, parquet, feather, so that a
    mismatch between the declared format and the actual content is tolerated.
    csv content is read with pandas and then converted to an arrow Table.

    Args:
        blob: The blob to parse, typically a seekable binary stream.

    Returns:
        The arrow Table produced by the first reader that succeeds.

    Raises:
        TypeError: If none of the readers can parse the blob. The exception
            raised by the last reader is attached as ``__cause__``.
    """
    # Lambdas keep the module lookups lazy; only the readers actually tried
    # are resolved.
    readers = {
        "csv": lambda data: pa.Table.from_pandas(pd.read_csv(data)),
        "parquet": lambda data: pq.read_table(data),
        # The feather fallback previously referenced the undefined name
        # `selfblob`, so it could never succeed.
        "arrow": lambda data: pf.read_table(data),
    }

    # Declared format first, then every other reader in csv/parquet/feather
    # order. Re-trying the declared reader would be pointless because each
    # attempt already starts from the beginning of the stream.
    attempts = [self.dataformat] if self.dataformat in readers else []
    attempts.extend(fmt for fmt in readers if fmt not in attempts)

    last_error = None
    for fmt in attempts:
        try:
            # The blob is cached and shared between calls, and a failed reader
            # may leave the stream at an arbitrary offset, so always rewind.
            if hasattr(blob, "seek"):
                blob.seek(0)
            return readers[fmt](blob)
        except Exception as ex:
            # Deliberately broad: the readers raise unrelated exception types
            # (parser errors, decode errors, ArrowInvalid, ...).
            last_error = ex

    raise TypeError(
        f"Unable to convert a blob of format {self.dataformat} to arrow;"
        " tried csv, parquet and feather."
    ) from last_error

def to_arrow(self) -> pa.Table:
"""Return object as an arrow Table
Expand All @@ -100,31 +129,7 @@ def to_arrow(self) -> pa.Table:
pa.Table: The arrow Table cached in self._arrowtable
"""
# NOTE(review): indentation is missing in this listing, and the span appears
# to interleave the pre-change body (the format-specific readers below) with
# the post-change body (the self._read_arrow() call near the end) -- confirm
# against the real diff before editing.

# The result is cached in self._arrowtable; the blob is only parsed while that
# attribute is still None.
if self._arrowtable is None:
if self["data"]["format"] == "parquet":
worked = "parquet"
self._arrowtable = pq.read_table(self.blob)
elif self["data"]["format"] == "arrow":
try:
worked = "feather"
self._arrowtable = pf.read_table(self.blob)
except pa.lib.ArrowInvalid:
worked = "parquet"
self._arrowtable = pq.read_table(self.blob)
else:
warn(
"Reading csv format into arrow, you will not get the full benefit of native arrow"
)
worked = "csv"
try:
self._arrowtable = pa.Table.from_pandas(
pd.read_csv(self.blob)
)

except TypeError as type_err:
raise OSError("Cannot read this into arrow") from type_err

self._logger.debug("Read blob as %s to return arrow", worked)

# Delegates blob retrieval and format handling to _read_arrow().
self._arrowtable = self._read_arrow()
return self._arrowtable

async def to_arrow_async(self) -> pa.Table:
Expand All @@ -134,26 +139,5 @@ async def to_arrow_async(self) -> pa.Table:
pa.Table: The arrow Table cached in self._arrowtable
"""
# NOTE(review): indentation is missing in this listing, and the span appears
# to interleave the pre-change body (the format-specific readers below) with
# the post-change body (the self._read_arrow_async() call near the end) --
# confirm against the real diff before editing.

# The result is cached in self._arrowtable; the blob is only parsed while that
# attribute is still None.
if self._arrowtable is None:
if self["data"]["format"] == "arrow":
try:
worked = "feather"
self._arrowtable = pf.read_table(await self.blob_async)
except pa.lib.ArrowInvalid:
worked = "parquet"
self._arrowtable = pq.read_table(await self.blob_async)
else:
warn(
"Reading csv format into arrow, you will not get the full benefit of native arrow"
)
worked = "csv"
try:
self._arrowtable = pa.Table.from_pandas(
pd.read_csv(await self.blob_async)
)

except TypeError as type_err:
raise OSError("Cannot read this into arrow") from type_err

self._logger.debug("Read blob as %s to return arrow", worked)

# Delegates blob retrieval and format handling to _read_arrow_async().
self._arrowtable = await self._read_arrow_async()
return self._arrowtable

0 comments on commit e53f511

Please sign in to comment.