Skip to content

Commit

Permalink
Issue #114/#200 support promoting feature properties to cube values
Browse files Browse the repository at this point in the history
- flattening: move options to export phase instead of vector cube constructor
- introduce `VectorCube.from_geodataframe` with support for promoting selected columns to cube values
- regardless of promotion: all properties are still associated with `VectorCube.geometries` for now (otherwise properties cannot be preserved when using `aggregate_spatial`, see Open-EO/openeo-api#504)
- only promote numerical values by default for now
  • Loading branch information
soxofaan committed Aug 2, 2023
1 parent 4085ea5 commit 082b812
Show file tree
Hide file tree
Showing 4 changed files with 193 additions and 70 deletions.
120 changes: 98 additions & 22 deletions openeo_driver/datacube.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import io

import geopandas as gpd
import numpy as np
import numpy
import pyproj
import shapely.geometry
import shapely.geometry.base
Expand Down Expand Up @@ -218,12 +218,13 @@ class DriverVectorCube:
DIM_GEOMETRIES = "geometries"
DIM_BANDS = "bands"
FLATTEN_PREFIX = "vc"
COLUMN_SELECTION_ALL = "all"
COLUMN_SELECTION_NUMERICAL = "numerical"

def __init__(
self,
geometries: gpd.GeoDataFrame,
cube: Optional[xarray.DataArray] = None,
flatten_prefix: str = FLATTEN_PREFIX,
):
"""
Expand All @@ -237,18 +238,77 @@ def __init__(
log.error(f"First cube dim should be {self.DIM_GEOMETRIES!r} but got dims {cube.dims!r}")
raise VectorCubeError("Cube's first dimension is invalid.")
if not geometries.index.equals(cube.indexes[cube.dims[0]]):
log.error(f"Invalid VectorCube components {geometries.index!r} != {cube.indexes[cube.dims[0]]!r}")
log.error(f"Invalid VectorCube components {geometries.index=} != {cube.indexes[cube.dims[0]]=}")
raise VectorCubeError("Incompatible vector cube components")
self._geometries: gpd.GeoDataFrame = geometries
self._cube = cube
self._flatten_prefix = flatten_prefix

def with_cube(self, cube: xarray.DataArray, flatten_prefix: str = FLATTEN_PREFIX) -> "DriverVectorCube":
def with_cube(self, cube: xarray.DataArray) -> "DriverVectorCube":
"""Create new vector cube with same geometries but new cube"""
log.info(f"Creating vector cube with new cube {cube.name!r}")
return type(self)(
geometries=self._geometries, cube=cube, flatten_prefix=flatten_prefix
)
return type(self)(geometries=self._geometries, cube=cube)

@classmethod
def from_geodataframe(
    cls,
    data: gpd.GeoDataFrame,
    *,
    columns_for_cube: Union[List[str], str] = COLUMN_SELECTION_NUMERICAL,
    dimension_name: str = DIM_BANDS,
) -> "DriverVectorCube":
    """
    Build a DriverVectorCube from given GeoPandas data frame,
    using the data frame geometries as vector cube geometries
    and other columns (as specified) as cube values along a "bands" dimension

    :param data: geopandas data frame
    :param columns_for_cube: which data frame columns to use as cube values.
        One of:
        - "numerical": automatically pick numerical columns
        - "all": use all columns as cube values
        - list of column names
    :param dimension_name: name of the "bands" dimension
    :return: vector cube
    """
    # Candidate columns for cube promotion: everything except the geometry column itself.
    available_columns = [c for c in data.columns if c != "geometry"]

    if columns_for_cube is None:
        # Treat an explicit `None` the same as the default selection mode.
        # TODO #114: what should default selection be?
        columns_for_cube = cls.COLUMN_SELECTION_NUMERICAL

    # Resolve the selection mode (string sentinel or explicit list) to a concrete list of column names.
    if columns_for_cube == cls.COLUMN_SELECTION_NUMERICAL:
        columns_for_cube = [c for c in available_columns if numpy.issubdtype(data[c].dtype, numpy.number)]
    elif columns_for_cube == cls.COLUMN_SELECTION_ALL:
        columns_for_cube = available_columns
    elif isinstance(columns_for_cube, list):
        # Caller-provided column names are used as-is (no availability check yet).
        # TODO #114 limit to subset with available columns (and automatically fill in missing columns with nodata)?
        columns_for_cube = columns_for_cube
    else:
        raise ValueError(columns_for_cube)
    assert isinstance(columns_for_cube, list)

    if columns_for_cube:
        cube_df = data[columns_for_cube]
        # Note: the promoted columns are deliberately kept in the geometries data frame too.
        # TODO: remove `columns_for_cube` from geopandas data frame?
        #   Enabling that triggers failure of some existing tests that use `aggregate_spatial`
        #   to "enrich" a vector cube with pre-existing properties
        #   Also see https://github.com/Open-EO/openeo-api/issues/504
        # geometries_df = data.drop(columns=columns_for_cube)
        geometries_df = data

        # Build the cube as a 2D (geometries x bands) array, indexed like the data frame.
        # TODO: leverage pandas `to_xarray` and xarray `to_array` instead of this manual building?
        cube: xarray.DataArray = xarray.DataArray(
            data=cube_df.values,
            dims=[cls.DIM_GEOMETRIES, dimension_name],
            coords={
                cls.DIM_GEOMETRIES: data.geometry.index.to_list(),
                dimension_name: cube_df.columns,
            },
        )
        return cls(geometries=geometries_df, cube=cube)

    else:
        # No columns selected/available: geometry-only vector cube without a cube component.
        return cls(geometries=data)

@classmethod
def from_fiona(
Expand All @@ -261,15 +321,21 @@ def from_fiona(
if len(paths) != 1:
# TODO #114 EP-3981: support multiple paths
raise FeatureUnsupportedException(message="Loading a vector cube from multiple files is not supported")
columns_for_cube = options.get("columns_for_cube", cls.COLUMN_SELECTION_NUMERICAL)
# TODO #114 EP-3981: lazy loading like/with DelayedVector
# note for GeoJSON: will consider Feature.id as well as Feature.properties.id
if "parquet" == driver:
return cls.from_parquet(paths=paths)
return cls.from_parquet(paths=paths, columns_for_cube=columns_for_cube)
else:
return cls(geometries=gpd.read_file(paths[0], driver=driver))
gdf = gpd.read_file(paths[0], driver=driver)
return cls.from_geodataframe(gdf, columns_for_cube=columns_for_cube)

@classmethod
def from_parquet(cls, paths: List[Union[str, Path]]):
def from_parquet(
cls,
paths: List[Union[str, Path]],
columns_for_cube: Union[List[str], str] = COLUMN_SELECTION_NUMERICAL,
):
if len(paths) != 1:
# TODO #114 EP-3981: support multiple paths
raise FeatureUnsupportedException(
Expand All @@ -287,10 +353,14 @@ def from_parquet(cls, paths: List[Union[str, Path]]):
if "OGC:CRS84" in str(df.crs) or "WGS 84 (CRS84)" in str(df.crs):
# workaround for not being able to decode ogc:crs84
df.crs = CRS.from_epsg(4326)
return cls(geometries=df)
return cls.from_geodataframe(df, columns_for_cube=columns_for_cube)

@classmethod
def from_geojson(cls, geojson: dict) -> "DriverVectorCube":
def from_geojson(
cls,
geojson: dict,
columns_for_cube: Union[List[str], str] = COLUMN_SELECTION_NUMERICAL,
) -> "DriverVectorCube":
"""Construct vector cube from GeoJson dict structure"""
validate_geojson_coordinates(geojson)
# TODO support more geojson types?
Expand All @@ -308,7 +378,8 @@ def from_geojson(cls, geojson: dict) -> "DriverVectorCube":
raise FeatureUnsupportedException(
f"Can not construct DriverVectorCube from {geojson.get('type', type(geojson))!r}"
)
return cls(geometries=gpd.GeoDataFrame.from_features(features))
gdf = gpd.GeoDataFrame.from_features(features)
return cls.from_geodataframe(gdf, columns_for_cube=columns_for_cube)

@classmethod
def from_geometry(
Expand All @@ -323,7 +394,9 @@ def from_geometry(
geometry = [geometry]
return cls(geometries=gpd.GeoDataFrame(geometry=geometry))

def _as_geopandas_df(self) -> gpd.GeoDataFrame:
def _as_geopandas_df(
self, flatten_prefix: Optional[str] = None, flatten_name_joiner: str = "~"
) -> gpd.GeoDataFrame:
"""Join geometries and cube as a geopandas dataframe"""
# TODO: avoid copy?
df = self._geometries.copy(deep=True)
Expand All @@ -334,18 +407,19 @@ def _as_geopandas_df(self) -> gpd.GeoDataFrame:
if self._cube.dims[1:]:
stacked = self._cube.stack(prop=self._cube.dims[1:])
log.info(f"Flattened cube component of vector cube to {stacked.shape[1]} properties")
name_prefix = [flatten_prefix] if flatten_prefix else []
for p in stacked.indexes["prop"]:
name = "~".join(str(x) for x in [self._flatten_prefix] + list(p))
name = flatten_name_joiner.join(str(x) for x in name_prefix + list(p))
# TODO: avoid column collisions?
df[name] = stacked.sel(prop=p)
else:
df[self._flatten_prefix] = self._cube
df[flatten_prefix or self.FLATTEN_PREFIX] = self._cube

return df

def to_geojson(self) -> dict:
def to_geojson(self, flatten_prefix: Optional[str] = None) -> dict:
"""Export as GeoJSON FeatureCollection."""
return shapely.geometry.mapping(self._as_geopandas_df())
return shapely.geometry.mapping(self._as_geopandas_df(flatten_prefix=flatten_prefix))

def to_wkt(self) -> List[str]:
wkts = [str(g) for g in self._geometries.geometry]
Expand All @@ -369,7 +443,8 @@ def write_assets(
)
return self.to_legacy_save_result().write_assets(directory)

self._as_geopandas_df().to_file(path, driver=format_info.fiona_driver)
gdf = self._as_geopandas_df(flatten_prefix=options.get("flatten_prefix"))
gdf.to_file(path, driver=format_info.fiona_driver)

if not format_info.multi_file:
# single file format
Expand Down Expand Up @@ -474,8 +549,9 @@ def get_xarray_cube_basics(self) -> Tuple[tuple, dict]:
return dims, coords

def __eq__(self, other):
return (isinstance(other, DriverVectorCube)
and np.array_equal(self._as_geopandas_df().values, other._as_geopandas_df().values))
return isinstance(other, DriverVectorCube) and numpy.array_equal(
self._as_geopandas_df().values, other._as_geopandas_df().values
)

def fit_class_random_forest(
self,
Expand Down
2 changes: 1 addition & 1 deletion openeo_driver/dummy/dummy_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def assert_polygon_sequence(geometries: Union[Sequence, BaseMultipartGeometry])
coords=coords,
name="aggregate_spatial",
)
return geometries.with_cube(cube=cube, flatten_prefix="agg")
return geometries.with_cube(cube=cube)
elif isinstance(geometries, str):
geometries = [geometry for geometry in DelayedVector(geometries).geometries]
n_geometries = assert_polygon_sequence(geometries)
Expand Down
97 changes: 65 additions & 32 deletions tests/test_vectorcube.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,37 +92,70 @@ def test_with_cube_to_geojson(self, gdf):
dims += ("bands",)
coords["bands"] = ["red", "green"]
cube = xarray.DataArray(data=[[1, 2], [3, 4]], dims=dims, coords=coords)
vc2 = vc1.with_cube(cube, flatten_prefix="bandz")
assert vc1.to_geojson() == DictSubSet({
"type": "FeatureCollection",
"features": [
DictSubSet({
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
"properties": {"id": "first", "pop": 1234},
}),
DictSubSet({
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
"properties": {"id": "second", "pop": 5678},
}),
]
})
assert vc2.to_geojson() == DictSubSet({
"type": "FeatureCollection",
"features": [
DictSubSet({
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
"properties": {"id": "first", "pop": 1234, "bandz~red": 1, "bandz~green": 2},
}),
DictSubSet({
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
"properties": {"id": "second", "pop": 5678, "bandz~red": 3, "bandz~green": 4},
}),
]
})
vc2 = vc1.with_cube(cube)
assert vc1.to_geojson() == DictSubSet(
{
"type": "FeatureCollection",
"features": [
DictSubSet(
{
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
"properties": {"id": "first", "pop": 1234},
}
),
DictSubSet(
{
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
"properties": {"id": "second", "pop": 5678},
}
),
],
}
)
assert vc2.to_geojson() == DictSubSet(
{
"type": "FeatureCollection",
"features": [
DictSubSet(
{
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
"properties": {"id": "first", "pop": 1234, "red": 1, "green": 2},
}
),
DictSubSet(
{
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
"properties": {"id": "second", "pop": 5678, "red": 3, "green": 4},
}
),
],
}
)
assert vc2.to_geojson(flatten_prefix="bandz") == DictSubSet(
{
"type": "FeatureCollection",
"features": [
DictSubSet(
{
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
"properties": {"id": "first", "pop": 1234, "bandz~red": 1, "bandz~green": 2},
}
),
DictSubSet(
{
"type": "Feature",
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
"properties": {"id": "second", "pop": 5678, "bandz~red": 3, "bandz~green": 4},
}
),
],
}
)

@pytest.mark.parametrize(["geojson", "expected"], [
(
Expand Down Expand Up @@ -342,7 +375,7 @@ def test_from_geometry(self, geometry, expected):
],
)
def test_from_fiona(self, path, driver):
vc = DriverVectorCube.from_fiona([path], driver=driver)
vc = DriverVectorCube.from_fiona([path], driver=driver, columns_for_cube=[])
assert vc.to_geojson() == DictSubSet(
{
"type": "FeatureCollection",
Expand Down
Loading

0 comments on commit 082b812

Please sign in to comment.