Skip to content

Commit

Permalink
CLN: Create ExistingDataProvider directly
Browse files — browse the repository at this point in the history
Rather than keeping existing metadata as an optional argument on every
data provider, construct an ExistingDataProvider directly from that
metadata whenever it is present.
  • Loading branch information
mferrera committed Apr 2, 2024
1 parent 1914659 commit 3897420
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 79 deletions.
61 changes: 26 additions & 35 deletions src/fmu/dataio/providers/_objectdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,42 +133,30 @@ def objectdata_provider_factory(
metadata for.
"""
if meta_existing:
return ExistingDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)

meta_existing = {}
return ExistingDataProvider.from_metadata_dict(obj, dataio, meta_existing)
if isinstance(obj, xtgeo.RegularSurface):
return RegularSurfaceDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return RegularSurfaceDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Polygons):
return PolygonsDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return PolygonsDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Points):
return PointsDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return PointsDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Cube):
return CubeDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return CubeDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Grid):
return CPGridDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return CPGridDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.GridProperty):
return CPGridPropertyDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return CPGridPropertyDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, pd.DataFrame):
return DataFrameDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return DataFrameDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, dict):
return DictionaryDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return DictionaryDataProvider(obj=obj, dataio=dataio)

from pyarrow import Table

if isinstance(obj, Table):
return ArrowTableDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return ArrowTableDataProvider(obj=obj, dataio=dataio)

raise NotImplementedError("This data type is not (yet) supported: ", type(obj))
raise NotImplementedError(f"This data type is not currently supported: {type(obj)}")


@dataclass
Expand Down Expand Up @@ -345,30 +333,33 @@ def get_objectdata(self) -> DerivedObjectDescriptor:

@dataclass
class ExistingDataProvider(ObjectDataProvider):
"""These functions should never be called because derive_metadata will populate the
object data from existing metadata, by calling _derive_from_existing, and return
before calling them."""
"""These getters should never be called because metadata was derived a priori."""

obj: Any
obj: Inferrable

def get_spec(self) -> dict[str, Any]:
def get_spec(self) -> dict:
"""Derive data.spec from existing metadata."""
return self.meta_existing["spec"]
return self.metadata["spec"]

def get_bbox(self) -> dict[str, Any]:
def get_bbox(self) -> dict:
"""Derive data.bbox from existing metadata."""
return self.meta_existing["bbox"]
return self.metadata["bbox"]

def get_objectdata(self) -> DerivedObjectDescriptor:
"""Derive object data for existing metadata."""
return DerivedObjectDescriptor(
subtype=self.meta_existing["subtype"],
classname=self.meta_existing["class"],
layout=self.meta_existing["layout"],
subtype=self.metadata["subtype"],
classname=self.metadata["class"],
layout=self.metadata["layout"],
efolder=self.efolder,
fmt=self.meta_existing["format"],
fmt=self.fmt,
extension=self.extension,
spec=self.get_spec(),
bbox=self.get_bbox(),
table_index=None,
)

def derive_metadata(self) -> None:
"""Metadata has already been derived for this provider, and is already set from
instantiation, so override this method and do nothing."""
return
80 changes: 39 additions & 41 deletions src/fmu/dataio/providers/_objectdata_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Final, Literal, Optional, TypeVar
from typing import TYPE_CHECKING, Any, Dict, Final, Literal, Optional, TypeVar
from warnings import warn

from fmu.dataio import dataio, types
Expand All @@ -14,6 +14,10 @@
from fmu.dataio.datastructure._internal.internal import AllowedContent
from fmu.dataio.datastructure.meta import content

if TYPE_CHECKING:
from fmu.dataio.dataio import ExportData
from fmu.dataio.types import Inferrable

logger: Final = null_logger(__name__)

V = TypeVar("V")
Expand Down Expand Up @@ -126,21 +130,16 @@ class ObjectDataProvider(ABC):
# input fields
obj: types.Inferrable
dataio: dataio.ExportData
meta_existing: dict = field(default_factory=dict)

# result properties; the most important is metadata which IS the 'data' part in
# the resulting metadata. But other variables needed later are also given
# as instance properties in addition (for simplicity in other classes/functions)
bbox: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
name: str = field(default="")
classname: str = field(default="")
efolder: str = field(default="")
extension: str = field(default="")
fmt: str = field(default="")
layout: str = field(default="")
metadata: dict = field(default_factory=dict)
name: str = field(default="")
specs: dict = field(default_factory=dict)
subtype: str = field(default="")
time0: str | None = field(default=None)
time1: str | None = field(default=None)

Expand Down Expand Up @@ -255,35 +254,22 @@ def _derive_timedata(self) -> Optional[dict[str, str]]:
mode="json", exclude_none=True
)

def _derive_from_existing(self) -> None:
"""Derive from existing metadata."""

# do not change any items in 'data' block, as it may ruin e.g. stratigrapical
# setting (i.e. changing data.name is not allowed)
self.metadata = self.meta_existing["data"]
self.name = self.meta_existing["data"]["name"]

# derive the additional attributes needed later e.g. in Filedata provider:
relpath = Path(self.meta_existing["file"]["relative_path"])
if self.dataio.subfolder:
self.efolder = relpath.parent.parent.name
else:
self.efolder = relpath.parent.name
@abstractmethod
def get_spec(self) -> dict:
raise NotImplementedError

self.classname = self.meta_existing["class"]
self.extension = relpath.suffix
self.fmt = self.meta_existing["data"]["format"]
@abstractmethod
def get_bbox(self) -> dict:
raise NotImplementedError

self.time0, self.time1 = parse_timedata(self.meta_existing["data"])
@abstractmethod
def get_objectdata(self) -> DerivedObjectDescriptor:
raise NotImplementedError

def derive_metadata(self) -> None:
"""Main function here, will populate the metadata block for 'data'."""
logger.info("Derive all metadata for data object...")

if self.meta_existing:
self._derive_from_existing()
return

namedstratigraphy = self._derive_name_stratigraphy()
objres = self.get_objectdata()
if self.dataio.forcefolder and not self.dataio.forcefolder.startswith("/"):
Expand Down Expand Up @@ -341,14 +327,26 @@ def derive_metadata(self) -> None:
self.fmt = objres.fmt
logger.info("Derive all metadata for data object... DONE")

@abstractmethod
def get_spec(self) -> dict[str, Any]:
raise NotImplementedError

@abstractmethod
def get_bbox(self) -> dict[str, Any]:
raise NotImplementedError

@abstractmethod
def get_objectdata(self) -> DerivedObjectDescriptor:
raise NotImplementedError
@classmethod
def from_metadata_dict(
cls, obj: Inferrable, dataio: ExportData, meta_existing: dict
) -> ObjectDataProvider:
"""Instantiate from existing metadata."""

relpath = Path(meta_existing["file"]["relative_path"])
time0, time1 = parse_timedata(meta_existing["data"])

return cls(
obj=obj,
dataio=dataio,
metadata=meta_existing["data"],
name=meta_existing["data"]["name"],
classname=meta_existing["class"],
efolder=(
relpath.parent.parent.name if dataio.subfolder else relpath.parent.name
),
extension=relpath.suffix,
fmt=meta_existing["data"]["format"],
time0=time0,
time1=time1,
)
81 changes: 78 additions & 3 deletions tests/test_units/test_objectdataprovider_class.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
"""Test the _ObjectData class from the _objectdata.py module"""

import os

import pytest
from fmu.dataio import dataio
from fmu.dataio._definitions import ConfigurationError, ValidFormats
from fmu.dataio.providers._objectdata import objectdata_provider_factory
from fmu.dataio._metadata import MetaData
from fmu.dataio.providers._objectdata import (
ExistingDataProvider,
objectdata_provider_factory,
)
from fmu.dataio.providers._objectdata_xtgeo import RegularSurfaceDataProvider

from ..utils import inside_rms

# --------------------------------------------------------------------------------------
# RegularSurface
Expand Down Expand Up @@ -67,8 +77,10 @@ def test_objectdata_regularsurface_spec_bbox(regsurf, edataobj1):
def test_objectdata_regularsurface_derive_objectdata(regsurf, edataobj1):
"""Derive other properties."""

res = objectdata_provider_factory(regsurf, edataobj1).get_objectdata()
objdata = objectdata_provider_factory(regsurf, edataobj1)
assert isinstance(objdata, RegularSurfaceDataProvider)

res = objdata.get_objectdata()
assert res.subtype == "RegularSurface"
assert res.classname == "surface"
assert res.extension == ".gri"
Expand All @@ -81,5 +93,68 @@ def test_objectdata_regularsurface_derive_metadata(regsurf, edataobj1):
myobj.derive_metadata()
res = myobj.metadata
assert res["content"] == "depth"

assert res["alias"]


def test_objectdata_provider_factory_raises_on_unknown(edataobj1):
with pytest.raises(NotImplementedError, match="not currently supported"):
objectdata_provider_factory(object(), edataobj1)


def test_regsurf_preprocessed_observation(
fmurun_w_casemetadata, rmssetup, rmsglobalconfig, regsurf
):
"""Test generating pre-realization surfaces that comes to share/preprocessed.
Later, a fmu run will update this (merge metadata)
"""

@inside_rms
def _export_data_from_rms(rmssetup, rmsglobalconfig, regsurf):
"""Run an export of a preprocessed surface inside RMS."""

os.chdir(rmssetup)
edata = dataio.ExportData(
config=rmsglobalconfig, # read from global config
fmu_context="preprocessed",
name="TopVolantis",
content="depth",
is_observation=True,
timedata=[[20240802, "moni"], [20200909, "base"]],
)
return edata, edata.export(regsurf)

def _run_case_fmu(fmurun_w_casemetadata, rmsglobalconfig, surfacepath):
"""Run FMU workflow, using the preprocessed data as case data.
When re-using metadata, the input object to dataio shall not be a XTGeo or
Pandas or ... instance, but just a file path (either as string or a pathlib.Path
object). This is because we want to avoid time and resources spent on double
reading e.g. a seismic cube, but rather trigger a file copy action instead.
But it requires that valid metadata for that file is found. The rule for
merging is currently defaulted to "preprocessed".
"""
os.chdir(fmurun_w_casemetadata)

casepath = fmurun_w_casemetadata.parent.parent
edata = dataio.ExportData(
config=rmsglobalconfig,
fmu_context="case",
content=None,
is_observation=True,
)
_ = edata.generate_metadata(
surfacepath,
casepath=casepath,
)
metaobj = MetaData(surfacepath, edata)
metaobj._populate_meta_objectdata()
assert isinstance(metaobj.objdata, ExistingDataProvider)
return metaobj

# run two stage process
edata, mysurf = _export_data_from_rms(rmssetup, rmsglobalconfig, regsurf)
metaobj = _run_case_fmu(fmurun_w_casemetadata, rmsglobalconfig, mysurf)
case_meta = metaobj.generate_export_metadata()
assert edata._metadata["data"] == case_meta["data"]

0 comments on commit 3897420

Please sign in to comment.