Add bulk zonal stats option using pyqgis - first version (theroggy#42)

KriWay-LV · May 4, 2023 · 8c64963 · 8c64963
1 parent c6ff7a1
commit 8c64963
Show file tree

Hide file tree

Showing 12 changed files with 519 additions and 246 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Improvements
 
+- Improve performance of zonal_stats_bulk (#38)
 - Use black to comply to pep8 + minor general improvements (#13)
 - Upgrade all dependencies (#12)
 - Add support for pandas 2.0 (#41)

diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ Once you have anaconda installed, you can open an anaconda terminal window and f
       ```
       2. Install the dependencies for the crop classification scripts:
       ```
-      conda install python=3.9 geopandas geofileops "h5py<3" openeo psutil rasterio rasterstats scikit-learn
+      conda install python=3.9 geopandas geofileops "h5py<3" openeo psutil qgis rasterio rasterstats scikit-learn
       ```
       3. If it was the first time you installed anaconda/geopandas, you might have to restart your computer to proceed
       4. Start the anaconda terminal window again and activate the environment

diff --git a/cropclassification/calc_timeseries.py b/cropclassification/calc_timeseries.py
@@ -175,9 +175,9 @@ def calc_timeseries_task(config_paths: List[Path], default_basedir: Path):
     try:
         images_bands = [(path, ["VV", "VH"]) for path in input_image_paths]
         zonal_stats_bulk.zonal_stats(
-            features_path=input_features_path,
+            vector_path=input_features_path,
             id_column=conf.columns["id"],
-            images_bands=images_bands,
+            rasters_bands=images_bands,
             output_dir=output_dir,
             temp_dir=temp_dir,
             log_dir=log_dir,
@@ -248,9 +248,9 @@ def calc_timeseries_task(config_paths: List[Path], default_basedir: Path):
         bands = conf.timeseries.getlist("s2bands")
         images_bands = [(path, bands) for path in input_image_paths]
         zonal_stats_bulk.zonal_stats(
-            features_path=input_features_path,
+            vector_path=input_features_path,
             id_column=conf.columns["id"],
-            images_bands=images_bands,
+            rasters_bands=images_bands,
             output_dir=output_dir,
             temp_dir=temp_dir,
             log_dir=log_dir,
@@ -292,9 +292,9 @@ def calc_timeseries_task(config_paths: List[Path], default_basedir: Path):
     try:
         images_bands = [(path, ["VV", "VH"]) for path in input_image_paths]
         zonal_stats_bulk.zonal_stats(
-            features_path=input_features_path,
+            vector_path=input_features_path,
             id_column=conf.columns["id"],
-            images_bands=images_bands,
+            rasters_bands=images_bands,
             output_dir=output_dir,
             temp_dir=temp_dir,
             log_dir=log_dir,

diff --git a/cropclassification/predict/classification_keras.py b/cropclassification/predict/classification_keras.py
@@ -335,7 +335,9 @@ def predict_proba(
         logger.info(f"Resulting Columns for predicting data: {parcel_data_df.columns}")
 
     # Load the classifier and predict
-    model = tf.keras.models.load_model(classifier_path)
+    model = tf.keras.models.load_model(str(classifier_path))
+    if model is None:
+        raise RuntimeError(f"Error loading model {classifier_path}")
     logger.info(f"Predict classes with probabilities: {len(parcel_df.index)} rows")
     class_proba = model.predict(parcel_data_df)
     logger.info("Predict classes with probabilities ready")

diff --git a/cropclassification/preprocess/_timeseries_calc_openeo.py b/cropclassification/preprocess/_timeseries_calc_openeo.py
@@ -55,13 +55,12 @@ def calculate_periodic_timeseries(
     if temp_dir == "None":
         temp_dir = Path(tempfile.gettempdir())
     zonal_stats_bulk.zonal_stats(
-        features_path=input_parcel_path,
+        vector_path=input_parcel_path,
         id_column=conf.columns["id"],
-        images_bands=images_bands,
+        rasters_bands=images_bands,
         output_dir=dest_data_dir,
-        temp_dir=temp_dir,
-        log_dir=conf.dirs.getpath("log_dir"),
-        log_level=conf.general.get("log_level"),
+        stats=["count", "mean", "median", "std", "min", "max"],
+        engine="pyqgis",
     )
 
     """

diff --git a/cropclassification/util/zonal_stats_bulk/__init__.py b/cropclassification/util/zonal_stats_bulk/__init__.py
@@ -1,17 +1,15 @@
 from pathlib import Path
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple
 
 from ._raster_helper import *  # noqa: F401, F403
 
 
 def zonal_stats(
-    features_path: Path,
+    vector_path: Path,
     id_column: str,
-    images_bands: List[Tuple[Path, List[str]]],
+    rasters_bands: List[Tuple[Path, List[str]]],
     output_dir: Path,
-    temp_dir: Path,
-    log_dir: Path,
-    log_level: Union[str, int],
+    stats: Literal["count", "mean", "median", "std", "min", "max"],
     cloud_filter_band: Optional[str] = None,
     calc_bands_parallel: bool = True,
     engine: str = "rasterstats",
@@ -21,44 +19,39 @@ def zonal_stats(
     Calculate zonal statistics.
 
     Args:
-        features_path (Path): _description_
+        vector_path (Path): _description_
         id_column (str): _description_
-        images_bands (List[Tuple[Path, List[str]]]): _description_
+        rasters_bands (List[Tuple[Path, List[str]]]): _description_
         output_dir (Path): _description_
-        temp_dir (Path): _description_
-        log_dir (Path): _description_
-        log_level (Union[str, int]): _description_
         force (bool, optional): _description_. Defaults to False.
 
     Raises:
         Exception: _description_
     """
     if engine == "pyqgis":
-        from _zonal_stats_bulk_pyqgis import zonal_stats
+        if cloud_filter_band is not None:
+            raise ValueError(
+                'cloud_filter_band parameter not supported for engine "pyqgis"'
+            )
+        from . import _zonal_stats_bulk_pyqgis
 
-        return zonal_stats(
-            features_path=features_path,
-            id_column=id_column,
-            images_bands=images_bands,
+        return _zonal_stats_bulk_pyqgis.zonal_stats(
+            vector_path=vector_path,
+            rasters_bands=rasters_bands,
             output_dir=output_dir,
-            temp_dir=temp_dir,
-            log_dir=log_dir,
-            log_level=log_level,
-            cloud_filter_band=cloud_filter_band,
-            calc_bands_parallel=calc_bands_parallel,
+            stats=stats,
+            columns=[id_column],
             force=force,
         )
     elif engine == "rasterstats":
-        from _zonal_stats_bulk_rs import zonal_stats
+        from . import _zonal_stats_bulk_rs
 
-        return zonal_stats(
-            features_path=features_path,
+        return _zonal_stats_bulk_rs.zonal_stats(
+            vector_path=vector_path,
             id_column=id_column,
-            images_bands=images_bands,
+            rasters_bands=rasters_bands,
             output_dir=output_dir,
-            temp_dir=temp_dir,
-            log_dir=log_dir,
-            log_level=log_level,
+            stats=stats,
             cloud_filter_band=cloud_filter_band,
             calc_bands_parallel=calc_bands_parallel,
             force=force,

diff --git a/cropclassification/util/zonal_stats_bulk/_general_helper.py b/cropclassification/util/zonal_stats_bulk/_general_helper.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
 logger = logging.getLogger(__name__)
 
@@ -11,7 +11,7 @@ def _format_output_path(
     image_path: Path,
     output_dir: Path,
     orbit_properties_pass: Optional[str],
-    band: Optional[str],
+    band: Optional[Union[str, int]],
 ) -> Path:
     """
     Prepare the output path.
@@ -40,9 +40,9 @@ def _format_output_path(
 def _format_progress_message(
     nb_todo: int,
     nb_done_total: int,
-    nb_done_latestbatch: int,
     start_time: datetime,
-    start_time_latestbatch: datetime,
+    nb_done_latestbatch: Optional[int] = None,
+    start_time_latestbatch: Optional[datetime] = None,
 ) -> str:
     """
     Returns a progress message based on the input.
@@ -55,9 +55,11 @@ def _format_progress_message(
         start_time_latestbatch: datetime the latest batch started
     """
     time_passed_s = (datetime.now() - start_time).total_seconds()
-    time_passed_latestbatch_s = (
-        datetime.now() - start_time_latestbatch
-    ).total_seconds()
+    time_passed_latestbatch_s = None
+    if start_time_latestbatch is not None:
+        time_passed_latestbatch_s = (
+            datetime.now() - start_time_latestbatch
+        ).total_seconds()
 
     # Calculate the overall progress
     large_number = 9999999999
@@ -68,17 +70,20 @@ def _format_progress_message(
     hours_to_go = (int)((nb_todo - nb_done_total) / nb_per_hour)
     min_to_go = (int)((((nb_todo - nb_done_total) / nb_per_hour) % 1) * 60)
 
-    # Calculate the speed of the latest batch
-    if time_passed_latestbatch_s > 0:
+    # Format message
+    message = (
+        f"{hours_to_go}:{min_to_go} left for {nb_todo-nb_done_total} todo at "
+        f"{nb_per_hour:0.0f}/h"
+    )
+    # Add speed of the latest batch to message if appropriate
+    if (
+        time_passed_latestbatch_s is not None
+        and nb_done_latestbatch is not None
+        and time_passed_latestbatch_s > 0
+    ):
         nb_per_hour_latestbatch = (
             nb_done_latestbatch / time_passed_latestbatch_s
         ) * 3600
-    else:
-        nb_per_hour_latestbatch = large_number
+        message = f"{message} ({nb_per_hour_latestbatch:0.0f}/h last batch)"
 
-    # Return formatted message
-    message = (
-        f"{hours_to_go}:{min_to_go} left for {nb_todo-nb_done_total} todo at "
-        f"{nb_per_hour:0.0f}/h ({nb_per_hour_latestbatch:0.0f}/h last batch)"
-    )
     return message