From a2d44dba82aae8f5cee2f6f103ed25f9fa623baa Mon Sep 17 00:00:00 2001 From: mferrera Date: Mon, 25 Mar 2024 08:38:47 +0100 Subject: [PATCH] CLN: Create ExistingDataProvider directly Rather than keeping existing metadata as an optional argument for all data providers, if existing metadata exists just create the class directly from it. --- src/fmu/dataio/providers/_objectdata.py | 62 ++++++------- src/fmu/dataio/providers/_objectdata_base.py | 86 ++++++++++--------- .../test_objectdataprovider_class.py | 80 ++++++++++++++++- 3 files changed, 148 insertions(+), 80 deletions(-) diff --git a/src/fmu/dataio/providers/_objectdata.py b/src/fmu/dataio/providers/_objectdata.py index b4e038fc1..5ec7eee47 100644 --- a/src/fmu/dataio/providers/_objectdata.py +++ b/src/fmu/dataio/providers/_objectdata.py @@ -133,42 +133,30 @@ def objectdata_provider_factory( metadata for. """ if meta_existing: - return ExistingDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing) - - meta_existing = {} + return ExistingDataProvider.from_metadata_dict(obj, dataio, meta_existing) if isinstance(obj, xtgeo.RegularSurface): - return RegularSurfaceDataProvider( - obj=obj, dataio=dataio, meta_existing=meta_existing - ) + return RegularSurfaceDataProvider(obj=obj, dataio=dataio) if isinstance(obj, xtgeo.Polygons): - return PolygonsDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing) + return PolygonsDataProvider(obj=obj, dataio=dataio) if isinstance(obj, xtgeo.Points): - return PointsDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing) + return PointsDataProvider(obj=obj, dataio=dataio) if isinstance(obj, xtgeo.Cube): - return CubeDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing) + return CubeDataProvider(obj=obj, dataio=dataio) if isinstance(obj, xtgeo.Grid): - return CPGridDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing) + return CPGridDataProvider(obj=obj, dataio=dataio) if isinstance(obj, xtgeo.GridProperty): - return 
CPGridPropertyDataProvider( - obj=obj, dataio=dataio, meta_existing=meta_existing - ) + return CPGridPropertyDataProvider(obj=obj, dataio=dataio) if isinstance(obj, pd.DataFrame): - return DataFrameDataProvider( - obj=obj, dataio=dataio, meta_existing=meta_existing - ) + return DataFrameDataProvider(obj=obj, dataio=dataio) if isinstance(obj, dict): - return DictionaryDataProvider( - obj=obj, dataio=dataio, meta_existing=meta_existing - ) + return DictionaryDataProvider(obj=obj, dataio=dataio) from pyarrow import Table if isinstance(obj, Table): - return ArrowTableDataProvider( - obj=obj, dataio=dataio, meta_existing=meta_existing - ) + return ArrowTableDataProvider(obj=obj, dataio=dataio) - raise NotImplementedError("This data type is not (yet) supported: ", type(obj)) + raise NotImplementedError(f"This data type is not currently supported: {type(obj)}") @dataclass @@ -345,30 +333,34 @@ def get_objectdata(self) -> DerivedObjectDescriptor: @dataclass class ExistingDataProvider(ObjectDataProvider): - """These functions should never be called because derive_metadata will populate the - object data from existing metadata, by calling _derive_from_existing, and return - before calling them.""" + """These getters should never be called because metadata was derived a priori.""" - obj: Any + obj: Inferrable - def get_spec(self) -> dict[str, Any]: + def get_spec(self) -> dict: """Derive data.spec from existing metadata.""" - return self.meta_existing["spec"] + return self.metadata["spec"] - def get_bbox(self) -> dict[str, Any]: + def get_bbox(self) -> dict: """Derive data.bbox from existing metadata.""" - return self.meta_existing["bbox"] + return self.metadata["bbox"] def get_objectdata(self) -> DerivedObjectDescriptor: """Derive object data for existing metadata.""" return DerivedObjectDescriptor( - subtype=self.meta_existing["subtype"], - classname=self.meta_existing["class"], - layout=self.meta_existing["layout"], + subtype=self.metadata["subtype"], + 
classname=self.metadata["class"], + layout=self.metadata["layout"], efolder=self.efolder, - fmt=self.meta_existing["format"], + fmt=self.fmt, extension=self.extension, spec=self.get_spec(), bbox=self.get_bbox(), table_index=None, ) + + def derive_metadata(self) -> None: + """Metadata has already been derived for this provider, and is already set from + instantiation, so override deriving the metadata with this method and do + nothing.""" + return diff --git a/src/fmu/dataio/providers/_objectdata_base.py b/src/fmu/dataio/providers/_objectdata_base.py index 0057392d4..367899fba 100644 --- a/src/fmu/dataio/providers/_objectdata_base.py +++ b/src/fmu/dataio/providers/_objectdata_base.py @@ -4,7 +4,7 @@ from dataclasses import asdict, dataclass, field from datetime import datetime from pathlib import Path -from typing import Any, Dict, Final, Literal, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Dict, Final, Literal, Optional, TypeVar from warnings import warn from fmu.dataio import dataio, types @@ -13,6 +13,10 @@ from fmu.dataio._utils import generate_description, parse_timedata from fmu.dataio.datastructure._internal.internal import AllowedContent +if TYPE_CHECKING: + from fmu.dataio.dataio import ExportData + from fmu.dataio.types import Inferrable + logger: Final = null_logger(__name__) V = TypeVar("V") @@ -136,21 +140,16 @@ class ObjectDataProvider(ABC): # input fields obj: types.Inferrable dataio: dataio.ExportData - meta_existing: dict = field(default_factory=dict) # result properties; the most important is metadata which IS the 'data' part in # the resulting metadata.
But other variables needed later are also given # as instance properties in addition (for simplicity in other classes/functions) - bbox: dict = field(default_factory=dict) + metadata: dict = field(default_factory=dict) + name: str = field(default="") classname: str = field(default="") efolder: str = field(default="") extension: str = field(default="") fmt: str = field(default="") - layout: str = field(default="") - metadata: dict = field(default_factory=dict) - name: str = field(default="") - specs: dict = field(default_factory=dict) - subtype: str = field(default="") time0: str = field(default="") time1: str = field(default="") @@ -282,36 +281,22 @@ def _derive_timedata( else TimedataFormat(None, None) ) - def _derive_from_existing(self) -> None: - """Derive from existing metadata.""" - - # do not change any items in 'data' block, as it may ruin e.g. stratigrapical - # setting (i.e. changing data.name is not allowed) - self.metadata = self.meta_existing["data"] - self.name = self.meta_existing["data"]["name"] - - # derive the additional attributes needed later e.g. in Filedata provider: - relpath = Path(self.meta_existing["file"]["relative_path"]) - if self.dataio.subfolder: - self.efolder = relpath.parent.parent.name - else: - self.efolder = relpath.parent.name + @abstractmethod + def get_spec(self) -> dict: + raise NotImplementedError - self.classname = self.meta_existing["class"] - self.extension = relpath.suffix - self.fmt = self.meta_existing["data"]["format"] + @abstractmethod + def get_bbox(self) -> dict: + raise NotImplementedError - # TODO: Clean up types below. 
- self.time0, self.time1 = parse_timedata(self.meta_existing["data"]) # type: ignore + @abstractmethod + def get_objectdata(self) -> DerivedObjectDescriptor: + raise NotImplementedError def derive_metadata(self) -> None: """Main function here, will populate the metadata block for 'data'.""" logger.info("Derive all metadata for data object...") - if self.meta_existing: - self._derive_from_existing() - return - namedstratigraphy = self._derive_name_stratigraphy() objres = self.get_objectdata() if self.dataio.forcefolder and not self.dataio.forcefolder.startswith("/"): @@ -379,14 +364,31 @@ def derive_metadata(self) -> None: self.fmt = objres.fmt logger.info("Derive all metadata for data object... DONE") - @abstractmethod - def get_spec(self) -> dict[str, Any]: - raise NotImplementedError - - @abstractmethod - def get_bbox(self) -> dict[str, Any]: - raise NotImplementedError - - @abstractmethod - def get_objectdata(self) -> DerivedObjectDescriptor: - raise NotImplementedError + @classmethod + def from_metadata_dict( + cls, obj: Inferrable, dataio: ExportData, meta_existing: dict + ) -> ObjectDataProvider: + """Instantiate from existing metadata.""" + + relpath = Path(meta_existing["file"]["relative_path"]) + + vals = { + "obj": obj, + "dataio": dataio, + "metadata": meta_existing["data"], + "name": meta_existing["data"]["name"], + "classname": meta_existing["class"], + "efolder": ( + relpath.parent.parent.name if dataio.subfolder else relpath.parent.name + ), + "extension": relpath.suffix, + "fmt": meta_existing["data"]["format"], + } + + time0, time1 = parse_timedata(meta_existing["data"]) + if time0: + vals["time0"] = time0 + if time1: + vals["time1"] = time1 + + return cls(**vals) diff --git a/tests/test_units/test_objectdataprovider_class.py b/tests/test_units/test_objectdataprovider_class.py index c7fb71890..0437ad9ab 100644 --- a/tests/test_units/test_objectdataprovider_class.py +++ b/tests/test_units/test_objectdataprovider_class.py @@ -1,8 +1,18 @@ """Test the 
_ObjectData class from the _objectdata.py module""" +import os + import pytest +from fmu.dataio import dataio from fmu.dataio._definitions import ConfigurationError, ValidFormats -from fmu.dataio.providers._objectdata import objectdata_provider_factory +from fmu.dataio._metadata import MetaData +from fmu.dataio.providers._objectdata import ( + ExistingDataProvider, + objectdata_provider_factory, +) +from fmu.dataio.providers._objectdata_xtgeo import RegularSurfaceDataProvider + +from ..utils import inside_rms # -------------------------------------------------------------------------------------- # RegularSurface @@ -67,8 +77,10 @@ def test_objectdata_regularsurface_spec_bbox(regsurf, edataobj1): def test_objectdata_regularsurface_derive_objectdata(regsurf, edataobj1): """Derive other properties.""" - res = objectdata_provider_factory(regsurf, edataobj1).get_objectdata() + objdata = objectdata_provider_factory(regsurf, edataobj1) + assert isinstance(objdata, RegularSurfaceDataProvider) + res = objdata.get_objectdata() assert res.subtype == "RegularSurface" assert res.classname == "surface" assert res.extension == ".gri" @@ -81,5 +93,67 @@ def test_objectdata_regularsurface_derive_metadata(regsurf, edataobj1): myobj.derive_metadata() res = myobj.metadata assert res["content"] == "depth" - assert res["alias"] + + +def test_objectdata_provider_factory_raises_on_unknown(edataobj1): + with pytest.raises(NotImplementedError, match="not currently supported"): + objectdata_provider_factory(object(), edataobj1) + + +def test_regsurf_preprocessed_observation( + fmurun_w_casemetadata, rmssetup, rmsglobalconfig, regsurf +): + """Test generating pre-realization surfaces that come to share/preprocessed.
+ + Later, an FMU run will update this (merge metadata) + """ + + @inside_rms + def _export_data_from_rms(rmssetup, rmsglobalconfig, regsurf): + """Run an export of a preprocessed surface inside RMS.""" + + os.chdir(rmssetup) + edata = dataio.ExportData( + config=rmsglobalconfig, # read from global config + fmu_context="preprocessed", + name="TopVolantis", + content="depth", + is_observation=True, + timedata=[[20240802, "moni"], [20200909, "base"]], + ) + return edata, edata.export(regsurf) + + def _run_case_fmu(fmurun_w_casemetadata, rmsglobalconfig, surfacepath): + """Run FMU workflow, using the preprocessed data as case data. + + When re-using metadata, the input object to dataio shall not be an XTGeo or + Pandas or ... instance, but just a file path (either as string or a pathlib.Path + object). This is because we want to avoid time and resources spent on double + reading e.g. a seismic cube, but rather trigger a file copy action instead. + + But it requires that valid metadata for that file is found. The rule for + merging is currently defaulted to "preprocessed". + """ + os.chdir(fmurun_w_casemetadata) + + casepath = fmurun_w_casemetadata.parent.parent + edata = dataio.ExportData( + config=rmsglobalconfig, + fmu_context="case", + content=None, + is_observation=True, + ) + _ = edata.generate_metadata( + surfacepath, + casepath=casepath, + ) + metaobj = MetaData(surfacepath, edata) + metaobj._populate_meta_objectdata() + assert isinstance(metaobj.objdata, ExistingDataProvider) + return metaobj + + # run two stage process + edata, mysurf = _export_data_from_rms(rmssetup, rmsglobalconfig, regsurf) + metaobj = _run_case_fmu(fmurun_w_casemetadata, rmsglobalconfig, mysurf) + assert edata._metadata["data"] == metaobj.generate_export_metadata()["data"]