continue
leoschwarz committed Jun 7, 2024
1 parent 1024e79 commit c4543e9
Showing 4 changed files with 50 additions and 29 deletions.
@@ -27,6 +27,7 @@ def main() -> None:
 
     imzmls = [
         "menzha_20231208_s607923_tonsil-repro-sample-01.imzML",
+        "menzha_20231208_s607923_tonsil-repro-sample-02.imzML",
         "menzha_20231208_s607930_64074-b20-30928-a.imzML",
         "menzha_20240212_tonsil_06-50.imzML",
     ]
@@ -35,7 +36,7 @@ def main() -> None:
     for imzml in imzmls:
         requested_files += prepare_tasks(data_raw_dir / imzml, work_dir=work_dir)
 
-    SnakemakeInvoke().invoke(work_dir=work_dir, result_files=requested_files, n_cores=4)
+    SnakemakeInvoke(continue_on_error=True).invoke(work_dir=work_dir, result_files=requested_files, n_cores=4)
 
 
 def get_all_output_files(folders: list[Path]) -> list[Path]:
18 changes: 9 additions & 9 deletions src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py
@@ -6,6 +6,7 @@
 import typer
 import xarray
 from hdbscan.flat import (HDBSCAN_flat)
+from loguru import logger
 from sklearn.preprocessing import StandardScaler
 from typer import Option
 
@@ -28,15 +29,14 @@ def cluster_dbscan(input_netcdf_path: Annotated[Path, Option()], output_netcdf_p
     # dbscan = (eps=0.3, min_samples=10)
     data_scaled = scaler.transform(reduced_data.values)
 
-    # clusterer = hdbscan.HDBSCAN()
-    # clusterer.fit(data_scaled)
-    # clusters = clusterer.labels_
-
-    clusterer = HDBSCAN_flat(data_scaled,
-                             cluster_selection_method='eom',
-                             n_clusters=10,
-                             min_cluster_size=math.ceil(0.02*data_scaled.shape[0]))
-    clusters = clusterer.labels_
+    try:
+        clusterer = HDBSCAN_flat(data_scaled,
+                                 n_clusters=10)
+        #min_cluster_size=math.ceil(0.02 * data_scaled.shape[0]))
+        clusters = clusterer.labels_
+    except IndexError:
+        logger.error("No clusters found")
+        clusters = np.zeros(data_scaled.shape[0])
 
     cluster_data = xarray.DataArray(clusters, dims=("i",), coords={"i": image.data_flat.coords["i"]}).expand_dims("c")
     cluster_data.coords["c"] = ["cluster"]
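Note on the cluster_hdbscan.py change: the commit wraps HDBSCAN_flat in a try/except because it can raise an IndexError when it cannot extract the requested number of flat clusters, and falls back to a single all-zero labeling. A minimal self-contained sketch of that pattern, on synthetic data (the array shape and random input are illustrative, not from the pipeline):

import numpy as np
from hdbscan.flat import HDBSCAN_flat
from loguru import logger

# Illustrative stand-in for the scaled, dimensionality-reduced data.
data_scaled = np.random.default_rng(0).normal(size=(500, 2))

try:
    # Request exactly 10 flat clusters; per the commit, this can raise
    # IndexError on degenerate input.
    clusterer = HDBSCAN_flat(data_scaled, n_clusters=10)
    clusters = clusterer.labels_
except IndexError:
    # Fall back to one label per point so downstream steps still get output.
    logger.error("No clusters found")
    clusters = np.zeros(data_scaled.shape[0])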
50 changes: 32 additions & 18 deletions src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
@@ -13,12 +13,12 @@
 from depiction_targeted_preproc.workflow.proc.__cluster_stats import compute_CHAOS, compute_PAS
 
 
-# from fastdtw import fastdtw
-
-
 def compute_silhouette(cluster_data, cluster_coords):
     # higher is better
-    return silhouette_score(cluster_coords, cluster_data)
+    try:
+        return silhouette_score(cluster_coords, cluster_data)
+    except ValueError:
+        return np.nan
 
 
 def compute_davies_bouldin(cluster_data, cluster_coords):
@@ -27,20 +27,13 @@ def compute_davies_bouldin(cluster_data, cluster_coords):
 
 
 def compute_calinski_harabasz(X, labels):
-    return calinski_harabasz_score(X, labels)
+    try:
+        return calinski_harabasz_score(X, labels)
+    except ValueError:
+        return np.nan
 
 
-def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path: Annotated[Path, Option()]) -> None:
-    cluster_image = MultiChannelImage.read_hdf5(input_netcdf_path)
-
-    cluster_data = cluster_image.data_flat.values.ravel()
-    cluster_coords = np.hstack(
-        (
-            cluster_image.data_flat.coords["x"].values.reshape(-1, 1),
-            cluster_image.data_flat.coords["y"].values.reshape(-1, 1),
-        )
-    )
-
+def compute_metrics(cluster_data: np.ndarray, cluster_coords: np.ndarray) -> dict[str, float]:
     chaos = compute_CHAOS(cluster_data, cluster_coords)
     logger.info(f"Computed CHAOS: {chaos}")
 
@@ -56,9 +49,30 @@ def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path:
     calinski_harabasz = compute_calinski_harabasz(cluster_coords, cluster_data)
     logger.info(f"Computed Calinski-Harabasz: {calinski_harabasz}")
 
+    return {
+        "CHAOS": chaos,
+        "PAS": pas,
+        "Silhouette": silhouette,
+        "Davies-Bouldin": davies_bouldin,
+        "Calinski-Harabasz": calinski_harabasz
+    }
+
+
+def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path: Annotated[Path, Option()]) -> None:
+    cluster_image = MultiChannelImage.read_hdf5(input_netcdf_path)
+
+    cluster_data = cluster_image.data_flat.values.ravel()
+    cluster_coords = np.hstack(
+        (
+            cluster_image.data_flat.coords["x"].values.reshape(-1, 1),
+            cluster_image.data_flat.coords["y"].values.reshape(-1, 1),
+        )
+    )
+
+    metrics = compute_metrics(cluster_data, cluster_coords)
+
     metrics_df = pl.DataFrame(
-        {"metric": ["CHAOS", "PAS", "Silhouette", "Davies-Bouldin", "Calinski-Harabasz"],
-         "value": [chaos, pas, silhouette, davies_bouldin, calinski_harabasz]}
+        {"metric": list(metrics.keys()), "value": list(metrics.values())}
     )
     metrics_df.write_csv(output_csv_path)
 
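Why the new ValueError guards in cluster_stats.py: scikit-learn's silhouette_score and calinski_harabasz_score reject labelings with fewer than two distinct clusters, which is exactly what the single-cluster fallback above can now produce. A short sketch of the failure mode and the NaN fallback (synthetic coordinates, illustrative only):

import numpy as np
from sklearn.metrics import silhouette_score

coords = np.random.default_rng(0).normal(size=(100, 2))
labels = np.zeros(100)  # degenerate labeling: a single cluster

try:
    score = silhouette_score(coords, labels)
except ValueError:
    # Recorded as NaN in the metrics CSV instead of crashing the rule.
    score = np.nan

print(score)  # nan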
8 changes: 7 additions & 1 deletion src/depiction_targeted_preproc/workflow/snakemake_invoke.py
@@ -10,6 +10,7 @@
 @dataclass
 class SnakemakeInvoke:
     use_subprocess: bool = True
+    continue_on_error: bool = False
 
     def invoke(self, work_dir: Path, result_files: list[Path], n_cores: int = 1) -> None:
         if self.use_subprocess:
@@ -31,6 +32,7 @@ def _invoke_direct(self, work_dir: Path, result_files: list[Path], n_cores: int)
         from snakemake.settings import StorageSettings
         from snakemake.settings import ResourceSettings
         from snakemake.settings import DAGSettings
+        from snakemake.settings import ExecutionSettings
 
         with SnakemakeApi(
             OutputSettings(
@@ -47,12 +49,15 @@ def _invoke_direct(self, work_dir: Path, result_files: list[Path], n_cores: int)
             dag_api = workflow_api.dag(
                 dag_settings=DAGSettings(targets=[str(p) for p in result_files], force_incomplete=True)
             )
-            dag_api.execute_workflow()
+            dag_api.execute_workflow(execution_settings=ExecutionSettings(keep_going=self.continue_on_error))
 
     def _invoke_subprocess(self, work_dir: Path, result_files: list[Path], n_cores: int) -> None:
         snakemake_bin = shutil.which("snakemake")
         if snakemake_bin is None:
             raise RuntimeError(f"snakemake not found, check PATH: {os.environ['PATH']}")
+        extra_args = []
+        if self.continue_on_error:
+            extra_args.append("--keep-going")
         command = [
             snakemake_bin,
             "-d",
@@ -61,6 +66,7 @@ def _invoke_subprocess(self, work_dir: Path, result_files: list[Path], n_cores:
             str(n_cores),
             "--snakefile",
             str(self.snakefile_path),
+            *extra_args,
             *[str(file.relative_to(work_dir)) for file in result_files],
         ]
         logger.info("Executing {command}", command=command)
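Taken together, continue_on_error=True maps to Snakemake's keep-going behavior in both execution paths: ExecutionSettings(keep_going=...) for the in-process API and the --keep-going flag for the subprocess invocation, so independent targets keep running after one fails. A hypothetical usage sketch (the work directory and result paths are made up; the import path assumes the package layout under src/):

from pathlib import Path

from depiction_targeted_preproc.workflow.snakemake_invoke import SnakemakeInvoke

work_dir = Path("/tmp/depiction_work")  # hypothetical work directory
result_files = [work_dir / "sample-01" / "cluster_stats.csv"]  # hypothetical target

# A failing target no longer aborts the whole DAG; remaining independent
# targets are still built, matching snakemake --keep-going semantics.
SnakemakeInvoke(continue_on_error=True).invoke(
    work_dir=work_dir,
    result_files=result_files,
    n_cores=4,
)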
