Fix table read (#180)

* Fix read pandas as csv
* Fix read arrow as parquet
* Correct return of pd.DataFrame when parquet
* Add test_table

Readdress test: like the other tests, it checks against an existing case
equinor · Jun 8, 2023 · 722f523 · 722f523
1 parent 82f96bb
commit 722f523
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 14 deletions.
diff --git a/src/fmu/sumo/explorer/objects/table.py b/src/fmu/sumo/explorer/objects/table.py
@@ -1,4 +1,5 @@
 """module containing class for table"""
+import logging
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -7,6 +8,8 @@
 from fmu.sumo.explorer.objects._child import Child
 from warnings import warn
 
+# Library code must not configure the root logger. Attach a NullHandler
+# instance to this module's own logger instead; passing the NullHandler
+# class as ``handlers=`` to basicConfig is not an iterable of handlers.
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
 
 class Table(Child):
     """Class representing a table object in Sumo"""
@@ -20,6 +23,7 @@ def __init__(self, sumo: SumoClient, metadata: dict) -> None:
         super().__init__(sumo, metadata)
         self._dataframe = None
         self._arrowtable = None
+        self._logger = logging.getLogger("__name__" + ".Table")
 
     @property
     def dataframe(self) -> pd.DataFrame:
@@ -42,22 +46,32 @@ def to_pandas(self) -> pd.DataFrame:
         Returns:
             DataFrame: A DataFrame object
         """
-        if not self._dataframe:
-            try:
-                self._dataframe = pd.read_parquet(self.blob)
 
-            except pa.lib.ArrowInvalid:
+        if self._dataframe is None:
+            if self["data"]["format"] == "csv":
+                worked = "csv"
+                self._logger.debug("Treating blob as csv")
+                try:
+                    self._dataframe = pd.read_csv(self.blob)
+                    worked = "csv"
+
+                except UnicodeDecodeError as ud_e:
+                    raise UnicodeDecodeError("Maybe not csv?") from ud_e
+            else:
                 try:
+                    worked = "feather"
                     self._dataframe = pf.read_feather(self.blob)
                 except pa.lib.ArrowInvalid:
                     try:
-                        self._dataframe = pd.read_csv(self.blob)
+                        worked = "parquet"
+                        self._dataframe = pd.read_parquet(self.blob)
 
                     except UnicodeDecodeError as ud_error:
                         raise TypeError(
                             "Come on, no way this is converting to pandas!!"
                         ) from ud_error
 
+        self._logger.debug("Read blob as %s to return pandas", worked)
         return self._dataframe
 
     @to_pandas.setter
@@ -86,17 +100,27 @@ def to_arrow(self) -> pa.Table:
         Returns:
             pa.Table: _description_
         """
-        if not self._arrowtable:
-            try:
-                self._arrowtable = pq.read_table(self.blob)
-            except pa.lib.ArrowInvalid:
+        if self._arrowtable is None:
+            if self["data"]["format"] == "arrow":
                 try:
+                    worked = "feather"
                     self._arrowtable = pf.read_table(self.blob)
                 except pa.lib.ArrowInvalid:
+                    worked = "parquet"
+                    self._arrowtable = pq.read_table(self.blob)
+            else:
+                warn(
+                    "Reading csv format into arrow, you will not get the full benefit of native arrow"
+                )
+                worked = "csv"
+                try:
                     self._arrowtable = pa.Table.from_pandas(
                         pd.read_csv(self.blob)
                     )
-            except TypeError as type_err:
-                raise OSError("Cannot read this") from type_err
+
+                except TypeError as type_err:
+                    raise OSError("Cannot read this into arrow") from type_err
+
+            self._logger.debug("Read blob as %s to return arrow", worked)
 
         return self._arrowtable
diff --git a/tests/test_aggregated_table.py b/tests/test_aggregated_table.py
@@ -53,9 +53,7 @@ def test_aggregated_summary_arrow_with_deprecated_function_name(
         DeprecationWarning,
         match=".arrowtable is deprecated, renamed to .to_arrow",
     ):
-        column.arrowtable
-
-    assert isinstance(column.arrowtable, pa.Table)
+        assert isinstance(column.arrowtable, pa.Table)
     with pytest.raises(IndexError) as e_info:
         table = table["banana"]
         assert (

diff --git a/tests/test_table.py b/tests/test_table.py
@@ -0,0 +1,65 @@
+from fmu.sumo.explorer import Explorer
+import pytest
+
+
+@pytest.fixture(name="doc", scope="function")
+def _fixture_doc():
+    """Return the first table matching the SNORRE pvt filter.
+
+    Looks up a fixed case by uuid through an Explorer created for "dev",
+    filters its tables and hands back the first hit.
+    """
+    sumo_case = Explorer("dev").get_case_by_uuid(
+        "8575ed9a-b0c1-4f64-b9ab-fab9cec31f4a"
+    )
+    selection = {
+        "name": "SNORRE",
+        "tagname": "pvt",
+        "iteration": "iter-0",
+        "realization": 113,
+    }
+    return sumo_case.tables.filter(**selection)[0]
+
+
+@pytest.fixture(name="correct_columns", scope="session")
+def _fixture_cols():
+    """Return the list of column names the tests compare against."""
+    column_names = (
+        "PRESSURE",
+        "VOLUMEFACTOR",
+        "VISCOSITY",
+        "RS",
+        "PVTNUM",
+        "KEYWORD",
+        "OILDENSITY",
+        "WATERDENSITY",
+        "GASDENSITY",
+        "COMPRESSIBILITY",
+        "VISCOSIBILITY",
+    )
+    return list(column_names)
+
+
+def test_to_pandas_with_csv(doc, correct_columns):
+    """Check that to_pandas yields exactly the expected columns.
+
+    Args:
+        doc (fmu.sumo.Document): the document to read from
+        correct_columns (list): list of correct columns
+    """
+    # sorted() builds a new list, so the session-scoped fixture list
+    # is not mutated in place and stays untouched for other tests
+    expected = sorted(correct_columns)
+    check_columns = sorted(doc.to_pandas.columns.tolist())
+    assert (
+        check_columns == expected
+    ), f"Cols should be {expected}, {check_columns}, when csv"
+
+
+def test_to_arrow_with_csv(doc, correct_columns):
+    """Check that to_arrow yields exactly the expected columns.
+
+    Args:
+        doc (fmu.sumo.Document): the document to read from
+        correct_columns (list): list of correct columns
+    """
+    # sorted() builds new lists, so neither the session-scoped fixture
+    # nor the list handed out by the arrow table is mutated in place
+    expected = sorted(correct_columns)
+    check_columns = sorted(doc.to_arrow.column_names)
+
+    assert (
+        check_columns == expected
+    ), f"Cols should be {expected}, {check_columns}, when arrow"