From 282ea0abdf00d89223c5670cf8979e8caca05b46 Mon Sep 17 00:00:00 2001 From: Daniel Sollien <62246179+daniel-sol@users.noreply.github.com> Date: Fri, 24 May 2024 14:22:14 +0200 Subject: [PATCH] API: Export pyarrow as parquet (#652) --- src/fmu/dataio/_definitions.py | 2 +- src/fmu/dataio/_utils.py | 18 +++++------------- src/fmu/dataio/dataio.py | 2 +- tests/test_units/test_ert_context.py | 2 +- tests/test_units/test_rms_context.py | 2 +- 5 files changed, 9 insertions(+), 17 deletions(-) diff --git a/src/fmu/dataio/_definitions.py b/src/fmu/dataio/_definitions.py index 2e9daa47a..9063f527f 100644 --- a/src/fmu/dataio/_definitions.py +++ b/src/fmu/dataio/_definitions.py @@ -43,7 +43,7 @@ class ValidFormats: default_factory=lambda: { "hdf": ".hdf", "csv": ".csv", - "arrow": ".arrow", + "parquet": ".parquet", } ) polygons: dict[str, str] = field( diff --git a/src/fmu/dataio/_utils.py b/src/fmu/dataio/_utils.py index c60772c53..dd8246522 100644 --- a/src/fmu/dataio/_utils.py +++ b/src/fmu/dataio/_utils.py @@ -146,22 +146,14 @@ def export_file( "preserved unless calling 'reset_index()' on the dataframe." ) obj.to_csv(filename, index=False) - elif filename.suffix == ".arrow": + elif filename.suffix == ".parquet": from pyarrow import Table if isinstance(obj, Table): - from pyarrow import feather - - # comment taken from equinor/webviz_subsurface/smry2arrow.py - # Writing here is done through the feather import, but could also be - # done using pa.RecordBatchFileWriter.write_table() with a few - # pa.ipc.IpcWriteOptions(). It is convenient to use feather since it - # has ready configured defaults and the actual file format is the same - # (https://arrow.apache.org/docs/python/feather.html) - - # Types in pyarrow-stubs package are wrong for the write_feather(...). - # https://arrow.apache.org/docs/python/generated/pyarrow.feather.write_feather.html#pyarrow.feather.write_feather - feather.write_feather(obj, dest=str(filename)) # type: ignore + from pyarrow import parquet + + parquet.write_table(obj, where=str(filename)) + elif filename.suffix == ".json" and isinstance(obj, FaultRoomSurface): with open(filename, "w") as stream: json.dump(obj.storage, stream, indent=4) diff --git a/src/fmu/dataio/dataio.py b/src/fmu/dataio/dataio.py index 9d106cb2c..1aaa30438 100644 --- a/src/fmu/dataio/dataio.py +++ b/src/fmu/dataio/dataio.py @@ -311,7 +311,7 @@ class ExportData: # class variables allow_forcefolder_absolute: ClassVar[bool] = False # deprecated - arrow_fformat: ClassVar[str] = "arrow" + arrow_fformat: ClassVar[str] = "parquet" case_folder: ClassVar[str] = "share/metadata" createfolder: ClassVar[bool] = True # deprecated cube_fformat: ClassVar[str] = "segy" diff --git a/tests/test_units/test_ert_context.py b/tests/test_units/test_ert_context.py index 4fdc846d6..b532d1b16 100644 --- a/tests/test_units/test_ert_context.py +++ b/tests/test_units/test_ert_context.py @@ -438,7 +438,7 @@ def test_pyarrow_export_file_set_name( assert str(output) == str( ( edata._rootpath - / "realization-0/iter-0/share/results/tables/myarrowtable.arrow" + / "realization-0/iter-0/share/results/tables/myarrowtable.parquet" ).resolve() ) diff --git a/tests/test_units/test_rms_context.py b/tests/test_units/test_rms_context.py index 6133cffd1..94913c5d7 100644 --- a/tests/test_units/test_rms_context.py +++ b/tests/test_units/test_rms_context.py @@ -595,7 +595,7 @@ def test_pyarrow_export_file_set_name(rmssetup, rmsglobalconfig, arrowtable): logger.info("Output is %s", output) assert str(output) == str( - (edata._rootpath / "share/results/tables/myarrowtable.arrow").resolve() + (edata._rootpath / "share/results/tables/myarrowtable.parquet").resolve() ) metaout = dataio.read_metadata(output)