Merge pull request #10 from SenteraLLC/dem-464-mosaicco-rs
[DEM-464] Support more than just NDVI spectral index
tnigon authored Jan 29, 2024
2 parents 965512b + eff44b1 commit 178bf1b
Showing 9 changed files with 1,010 additions and 783 deletions.
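
At a high level, `expression_from_collection()` no longer hard-codes NDVI: any index defined in `spyndex.indices` can be requested, and spyndex's band names are remapped to the collection's own asset names where they differ. A minimal sketch of the intended call pattern (illustrative only; "NDRE" is just an example index):

from pixels_utils.stac_catalogs.earthsearch.v1 import EarthSearchCollections, expression_from_collection

# Any index available via spyndex.indices should now be accepted
expr = expression_from_collection(
    collection=EarthSearchCollections.sentinel_2_l2a, spectral_index="NDRE"
)
print(expr.expression)  # expression string using collection-specific asset names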
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -1,15 +1,15 @@
 repos:
   - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
         name: isort (python)
   - repo: https://github.com/ambv/black
-    rev: 23.11.0
+    rev: 24.1.1
     hooks:
       - id: black
   - repo: https://github.com/pycqa/flake8
-    rev: '6.1.0'
+    rev: '7.0.0'
     hooks:
       - id: flake8
         exclude: (tests|doc)
2 changes: 1 addition & 1 deletion pixels_utils/_version.py
@@ -1,3 +1,3 @@
 """Defines package version. Parsed by setup.py and imported by __init__.py."""
 
-__version__ = "0.4.1"
+__version__ = "0.5.0"
6 changes: 6 additions & 0 deletions pixels_utils/helpers/__init__.py
@@ -0,0 +1,6 @@
from pixels_utils.helpers._helper import filter_scenes_with_all_nodata, get_satellite_expression_as_df

__all__ = (
    "filter_scenes_with_all_nodata",
    "get_satellite_expression_as_df",
)
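
The new subpackage re-exports both helpers, so downstream code can import them directly (a trivial sketch of the public surface added here):

from pixels_utils.helpers import filter_scenes_with_all_nodata, get_satellite_expression_as_df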
168 changes: 168 additions & 0 deletions pixels_utils/helpers/_helper.py
@@ -0,0 +1,168 @@
import logging
from datetime import datetime
from typing import Union

from geojson.feature import Feature
from numpy import float32
from pandas import DataFrame, Series, concat
from rasterio import MemoryFile
from rasterio.enums import Resampling
from rasterio.sample import sample_gen
from tqdm import tqdm

from pixels_utils.scenes import parse_nested_stac_data
from pixels_utils.stac_catalogs.earthsearch.v1 import EARTHSEARCH_SCENE_URL, EarthSearchCollections, Expression
from pixels_utils.titiler import TITILER_ENDPOINT
from pixels_utils.titiler.endpoints.stac import Crop, QueryParamsCrop, QueryParamsStatistics, Statistics
from pixels_utils.titiler.mask.enum_classes import Sentinel2_SCL_Group


def filter_scenes_with_all_nodata(
    df_scenes: DataFrame,
    feature: Feature,
    expression_obj: Expression,
    gsd: Union[float, int] = 10,
    nodata: Union[float, int] = -999,
) -> list:
    """
    Finds the index of all scenes without any valid pixels across the extent of the field group geometry.

    Note:
        The Titiler/sentera.pixels endpoint has a nuance that if all pixels within a spatial query are NODATA, then
        the Statistics endpoint returns statistics values equivalent to the NODATA value set. This function finds
        the index of all scenes where this is the case so they can be removed from the list of scenes to process.

    Args:
        df_scenes (DataFrame): Initial list of scenes to process.
        feature (Feature): GeoJSON Feature defining the spatial extent of the statistics query.
        expression_obj (Expression): Spectral index expression to evaluate for each scene.
        gsd (Union[float, int]): Ground sample distance of the query. Defaults to 10.
        nodata (Union[float, int]): Value treated as NODATA. Defaults to -999.

    Returns:
        list: Index values of df_scenes that can safely be removed because they do not have any valid pixels.
    """
    df_scene_properties = parse_nested_stac_data(df=df_scenes, column="properties")

    ind_nodata = []
    for ind in tqdm(
        list(df_scenes.index),
        desc="Filtering out scenes where clouds are covering the plot area",
    ):
        scene = df_scenes.loc[ind]
        properties = df_scene_properties.loc[ind]
        date = datetime.strptime(properties["datetime"].split("T")[0], "%Y-%m-%d")

        scene_url = EARTHSEARCH_SCENE_URL.format(
            collection=EarthSearchCollections.sentinel_2_l2a.name, id=scene["id"]
        )

        query_params = QueryParamsStatistics(
            url=scene_url,
            feature=feature,
            expression=expression_obj.expression,
            asset_as_band=True,
            coord_crs=None,
            gsd=gsd,
            nodata=nodata,
            resampling=Resampling.nearest.name,
        )

        stats_arable_wlist = Statistics(
            query_params=query_params,
            clear_cache=False,
            titiler_endpoint=TITILER_ENDPOINT,
            mask_enum=Sentinel2_SCL_Group.ARABLE,
            mask_asset="scl",
            whitelist=True,
        )

        expression_ = list(stats_arable_wlist.response.json()["properties"]["statistics"].keys())[0]
        stats = stats_arable_wlist.response.json()["properties"]["statistics"][expression_]
        if stats["min"] == nodata:
            # Indicates at least one pixel from statistics query is set to NODATA (i.e., masked)
            logging.debug(
                "%s: No valid pixels available",
                date.date(),
            )
            ind_nodata += [ind]
        else:
            logging.debug(
                "%s: %s valid pixels (mean %s = %s)",
                date.date(),
                stats["count"],
                expression_obj.short_name,
                round(stats["mean"], 2),
            )
    return ind_nodata


def get_satellite_expression_as_df(
    gdf_fields_subset: DataFrame,
    scene: Series,
    feature: Feature,
    expression_obj: Expression,
    gsd: Union[float, int],
    nodata: Union[float, int],
) -> DataFrame:
    """
    Retrieves the `expression_obj` image for `scene` via sentera.pixels.com, then samples the spectral index value
    at each centroid in `gdf_fields_subset`.

    Returns:
        DataFrame: Spectral index values with "field_id" and "geom_id" join keys.
    """
    query_params_crop = QueryParamsCrop(
        url=EARTHSEARCH_SCENE_URL.format(
            collection=EarthSearchCollections.sentinel_2_l2a.name, id=scene["id"]
        ),
        feature=feature,
        gsd=gsd,
        format_=".tif",
        expression=expression_obj.expression,
        asset_as_band=True,
        nodata=nodata,
        resampling=Resampling.nearest.name,
    )
    crop_arable_wlist = Crop(
        query_params=query_params_crop,
        clear_cache=False,
        titiler_endpoint=TITILER_ENDPOINT,
        mask_enum=Sentinel2_SCL_Group.ARABLE,
        mask_asset="scl",
        whitelist=True,
    )
    data_mask, profile_mask, tags = crop_arable_wlist.to_rasterio(
        **{
            "dtype": float32,
            "band_names": [expression_obj.short_name],
            "band_description": [expression_obj.short_name],
        }
    )

    with MemoryFile() as memfile:
        with memfile.open(**profile_mask) as src:  # Open as DatasetWriter
            src.write(data_mask)

            # Do not sort xy in sample_gen function; order must be maintained for merging to primary keys
            df_data = DataFrame(
                data=sample_gen(
                    src,
                    xy=list(
                        gdf_fields_subset[gdf_fields_subset.geometry.name].apply(
                            lambda geometry: geometry.centroid.coords[0]
                        )
                    ),
                ),
                columns=[expression_obj.short_name],
            )
            # Remove any rows where the expression value is set to NODATA
            df_data.drop(df_data.loc[df_data[expression_obj.short_name] == nodata].index, inplace=True)
            df_data.insert(0, "scene_id", scene["id"])
            df_data.insert(1, "datetime", scene["datetime"])
            df_data = concat(
                [
                    gdf_fields_subset.reset_index(drop=True),
                    df_data,
                ],
                axis=1,
            )
    return df_data
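
Taken together, the two helpers support a filter-then-sample workflow. A hedged sketch of how they might be wired up; `df_scenes`, `feature`, and `gdf_fields` are assumed inputs (e.g., from `search_stac_scenes()` and a GeoDataFrame of field boundaries), not anything defined in this PR:

from pandas import concat

from pixels_utils.helpers import filter_scenes_with_all_nodata, get_satellite_expression_as_df
from pixels_utils.stac_catalogs.earthsearch.v1 import EarthSearchCollections, expression_from_collection

expr = expression_from_collection(
    collection=EarthSearchCollections.sentinel_2_l2a, spectral_index="NDVI"
)

# Drop scenes with no valid pixels (e.g., fully cloud-masked) over the feature
ind_nodata = filter_scenes_with_all_nodata(
    df_scenes=df_scenes, feature=feature, expression_obj=expr, gsd=10, nodata=-999
)
df_scenes_valid = df_scenes.drop(index=ind_nodata)

# Sample the index value at each field centroid for every remaining scene
df_samples = concat(
    [
        get_satellite_expression_as_df(
            gdf_fields_subset=gdf_fields,
            scene=df_scenes_valid.loc[ind],
            feature=feature,
            expression_obj=expr,
            gsd=10,
            nodata=-999,
        )
        for ind in df_scenes_valid.index
    ],
    axis=0,
)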
2 changes: 1 addition & 1 deletion pixels_utils/scenes/__init__.py
@@ -1,3 +1,3 @@
-from ._scenes import parse_nested_stac_data, request_asset_info, search_stac_scenes
+from pixels_utils.scenes._scenes import parse_nested_stac_data, request_asset_info, search_stac_scenes
 
 __all__ = ("parse_nested_stac_data", "request_asset_info", "search_stac_scenes")
78 changes: 61 additions & 17 deletions pixels_utils/stac_catalogs/earthsearch/v1.py
@@ -31,6 +31,14 @@ def _version(self):
         return "v1"
 
 
+def _generate_expression_override(expression: str, assets: list[str], assets_override: list[str]) -> str:
+    """Return expression_override with assets replaced by assets_override."""
+    expression_override = expression
+    for asset_common, asset_collection in zip(assets, assets_override):
+        expression_override = expression_override.replace(asset_common, asset_collection)
+    return expression_override
+
+
 def expression_from_collection(
     collection: EarthSearchCollections = EarthSearchCollections.sentinel_2_l2a, spectral_index: str = "NDVI"
 ) -> Expression:
@@ -49,21 +57,56 @@ def expression_from_collection(
     Returns:
         Expression: The tailored Expression object.
     """
-    assert collection._version == "v1", (
-        f"The `collection` and `expression_from_collection()` function versions do not match ({collection._version} vs "
-        f'v1). Ensure both the "{collection}" and expression_from_collection() function are imported from the same '
-        "version of `pixels_utils.stac_catalogs.earthsearch`."
-    )
+    if spectral_index not in spyndex_indices.keys():
+        raise AssertionError(f"{spectral_index} is not a valid spectral index (must be available via spyndex.indices).")
+    if collection._version != "v1":
+        raise AssertionError(
+            f"The `collection` and `expression_from_collection()` function versions do not match ({collection._version} vs "
+            f'v1). Ensure both the "{collection}" and expression_from_collection() function are imported from the same '
+            "version of `pixels_utils.stac_catalogs.earthsearch`."
+        )
     stac_collection_url = EARTHSEARCH_COLLECTION_URL.format(collection=collection.name)
     stac_metadata = STACMetaData(collection_url=stac_collection_url)
     # TODO: stac_metadata.parse_asset_bands("eo:bands", return_dataframe=True) for support of EarthSearch v0
-    assert set(Expression(spyndex_object=spyndex_indices[spectral_index]).assets).issubset(
-        set(stac_metadata.asset_names)
-    ), f'Assets for spectral index "{spectral_index}" are not available in collection "{collection}".'
-    assert (
-        spectral_index in spyndex_indices.keys()
-    ), f"{spectral_index} is not a valid spectral index (must be available via spyndex.indices)."
-    return Expression(spyndex_object=spyndex_indices["NDVI"])
+
+    # get list of `name` values where `common_name` matches spectral_index
+    assets_ = Expression(spyndex_object=spyndex_indices[spectral_index]).assets
+    expression_str_ = Expression(spyndex_object=spyndex_indices[spectral_index]).expression
+
+    df_collection_assets = stac_metadata.parse_asset_bands("eo:bands", return_dataframe=True)
+    df_expr_assets_ = df_collection_assets[df_collection_assets["common_name"].isin(assets_)][
+        ["name", "common_name"]
+    ].drop_duplicates(subset="common_name")
+
+    # Check if all `common_names` match `names` (i.e., collection_names)
+    if any([row["name"] != row["common_name"] for _, row in df_expr_assets_.iterrows()]):
+        # Raise error if collection does not have assets available
+        for asset in assets_:
+            if asset not in df_expr_assets_["common_name"].values:
+                raise ValueError(
+                    f'"{asset}" is required for spectral index "{spectral_index}", but is not a valid asset (i.e., '
+                    f'"common_name") in collection "{collection}".\nValid "common_names":\n'
+                    f'{df_collection_assets["common_name"].values}'
+                )
+        # Set assets_override and expression_override based on available `name` values for the collection.
+        assets_override = [
+            df_expr_assets_[df_expr_assets_["common_name"] == asset]["name"].values[0] for asset in assets_
+        ]
+        expression_override = _generate_expression_override(expression_str_, assets_, assets_override)
+    else:
+        assets_override, expression_override = None, None
+
+    expression = Expression(
+        spyndex_object=spyndex_indices[spectral_index],
+        assets_override=assets_override,
+        expression_override=expression_override,
+    )
+
+    if not set(expression.assets).issubset(set(stac_metadata.asset_names)):
+        raise AssertionError(
+            f'Assets for spectral index "{spectral_index}" are not available in collection "{collection}".'
+        )
+    return expression


@@ -79,11 +122,12 @@ def expressions_from_collection(
     Returns:
         Expressions: The Expressions object containing all available spectral indices for the given collection.
     """
-    assert collection._version == "v1", (
-        f"The `collection` and `expression_from_collection()` function versions do not match ({collection._version} vs "
-        f'v1). Ensure both the "{collection}" and expression_from_collection() function are imported from the same '
-        "version of `pixels_utils.stac_catalogs.earthsearch`."
-    )
+    if collection._version != "v1":
+        raise AssertionError(
+            f"The `collection` and `expressions_from_collection()` function versions do not match ({collection._version} vs "
+            f'v1). Ensure both the "{collection}" and expressions_from_collection() function are imported from the same '
+            "version of `pixels_utils.stac_catalogs.earthsearch`."
+        )
     # Get list of assets in collection and filter spyndex_bands by collection assets
     stac_collection_url = EARTHSEARCH_COLLECTION_URL.format(collection=collection.name)
     stac_metadata = STACMetaData(collection_url=stac_collection_url)
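
The override mechanism itself is plain string substitution. A toy walk-through of `_generate_expression_override()` with hypothetical band names (real `name`/`common_name` pairs come from the collection's `eo:bands` metadata, not from this example):

# Hypothetical values; actual pairs come from STACMetaData.parse_asset_bands("eo:bands")
expression = "(nir - red) / (nir + red)"
assets = ["nir", "red"]  # common_name values used by spyndex
assets_override = ["B08", "B04"]  # collection-specific asset names (illustrative)

# Mirrors the loop inside _generate_expression_override() above
for asset_common, asset_collection in zip(assets, assets_override):
    expression = expression.replace(asset_common, asset_collection)

assert expression == "(B08 - B04) / (B08 + B04)"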
2 changes: 1 addition & 1 deletion pixels_utils/titiler/endpoints/stac/crop/_crop.py
@@ -359,7 +359,7 @@ def to_rasterio(self, **kwargs) -> Tuple[ArrayLike, Profile, Dict]:
         # TODO: Consider validating kwargs before passing to parse_crop_response()
         data_mask, profile_mask, tags = parse_crop_response(
             r=self.response,
-            **kwargs
+            **kwargs,
             # **{"dtype": float32, "band_names": [collection_ndvi.short_name], "band_description": [collection_ndvi.short_name]},
         )
         return data_mask, profile_mask, tags
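
The only functional change here is the trailing comma, but the commented-out line hints at the typical kwargs. A hedged example of passing them through `to_rasterio()`; `crop` and `expr` are illustrative names for a configured `Crop` and `Expression`, mirroring the call in `pixels_utils/helpers/_helper.py` above:

from numpy import float32

data_mask, profile_mask, tags = crop.to_rasterio(
    dtype=float32,
    band_names=[expr.short_name],
    band_description=[expr.short_name],
)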