From 1024e79fbdeae3250255e270c6df982488e4c882 Mon Sep 17 00:00:00 2001
From: Leonardo Schwarz
Date: Fri, 7 Jun 2024 14:51:48 +0200
Subject: [PATCH] also add hdbscan as an alternative approach

---
 pyproject.toml                                |  3 +-
 src/depiction_targeted_preproc/example/run.py |  1 +
 .../example_compare/run_compare.py            |  3 +-
 .../workflow/exp/plot_compare_peak_density.py |  7 ++-
 .../workflow/experimental.smk                 |  2 +-
 .../workflow/proc/cluster_hdbscan.py          | 44 ++++++++++++++++++
 .../workflow/proc/cluster_kmeans.py           | 10 ++--
 .../workflow/proc/cluster_stats.py            | 32 +++++++++++++-
 .../workflow/qc/plot_peak_density.py          | 19 +++----
 .../workflow/rules/rules_proc.smk             | 15 ++++--
 .../workflow/vis/clustering.py                |  3 ++
 11 files changed, 114 insertions(+), 25 deletions(-)
 create mode 100644 src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py

diff --git a/pyproject.toml b/pyproject.toml
index 28ae232..9e5d96f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,8 @@ dev = [
     "tifffile>=2024.2.12",
     "xmltodict",
     "licensecheck",
-    "ms-peak-picker"
+    "ms-peak-picker",
+    "hdbscan"
 ]
 
 # potentially relevant in the future again:
diff --git a/src/depiction_targeted_preproc/example/run.py b/src/depiction_targeted_preproc/example/run.py
index ddcc114..64e0882 100644
--- a/src/depiction_targeted_preproc/example/run.py
+++ b/src/depiction_targeted_preproc/example/run.py
@@ -31,6 +31,7 @@
         "cluster_default_kmeans.hdf5",
         "cluster_default_stats_kmeans.csv",
         "cluster_default_kmeans.png",
+        "cluster_default_hdbscan.png",
     ],
 }
 
diff --git a/src/depiction_targeted_preproc/example_compare/run_compare.py b/src/depiction_targeted_preproc/example_compare/run_compare.py
index 32f18b1..11a8748 100644
--- a/src/depiction_targeted_preproc/example_compare/run_compare.py
+++ b/src/depiction_targeted_preproc/example_compare/run_compare.py
@@ -17,8 +17,7 @@ def prepare_tasks(input_imzml_path: Path, work_dir: Path) -> list[Path]:
     requested_files = get_all_output_files(folders)
 
     combined_dir = work_dir / input_imzml_path.stem
-    exp_files = [combined_dir / "exp_compare_cluster_stats.pdf",
-                 combined_dir / "exp_plot_compare_peak_density.pdf"]
+    exp_files = [combined_dir / "exp_compare_cluster_stats.pdf", combined_dir / "exp_plot_compare_peak_density.pdf"]
     return requested_files + exp_files
 
 
diff --git a/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py b/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py
index 6f0a436..5f3b665 100644
--- a/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py
+++ b/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py
@@ -17,8 +17,11 @@ def exp_plot_compare_peak_density(
     vegafusion.enable()
 
     table = pl.concat(
-        [pl.read_parquet(path).with_columns(variant=pl.lit(path.parents[1].name)) for path in
-         tables_marker_distances_calib])
+        [
+            pl.read_parquet(path).with_columns(variant=pl.lit(path.parents[1].name))
+            for path in tables_marker_distances_calib
+        ]
+    )
     table = pl.concat([table, pl.read_parquet(table_marker_distance_uncalib).with_columns(variant=pl.lit("uncalib"))])
 
     plot_density_combined_full(df_peak_dist=table, out_pdf=output_pdf)
diff --git a/src/depiction_targeted_preproc/workflow/experimental.smk b/src/depiction_targeted_preproc/workflow/experimental.smk
index 1980553..976cc06 100644
--- a/src/depiction_targeted_preproc/workflow/experimental.smk
+++ b/src/depiction_targeted_preproc/workflow/experimental.smk
@@ -9,7 +9,7 @@ exp_variants = ["chem_noise", "mass_cluster", "reg_shift"]
 
 rule exp_compare_cluster_stats:
     input:
-        csv=expand("{{sample}}/{exp_variant}/cluster_default_stats_kmeans.csv", exp_variant=exp_variants)
+        csv=expand("{{sample}}/{exp_variant}/cluster_default_stats_hdbscan.csv", exp_variant=exp_variants)
     output:
         pdf="{sample}/exp_compare_cluster_stats.pdf"
     shell:
diff --git a/src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py b/src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py
new file mode 100644
index 0000000..343edc7
--- /dev/null
+++ b/src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+from typing import Annotated
+
+import math
+import numpy as np
+import typer
+import xarray
+from hdbscan.flat import HDBSCAN_flat
+from sklearn.preprocessing import StandardScaler
+from typer import Option
+
+from depiction.image.multi_channel_image import MultiChannelImage
+from depiction_targeted_preproc.workflow.proc.cluster_kmeans import retain_strongest_signals
+
+
+def cluster_hdbscan(input_netcdf_path: Annotated[Path, Option()], output_netcdf_path: Annotated[Path, Option()]) -> None:
+    image = MultiChannelImage.read_hdf5(input_netcdf_path)
+    # TODO make configurable
+    n_clusters = 10
+    n_features = 50
+
+    reduced_data = retain_strongest_signals(image.data_flat.transpose("i", "c"), n_features)
+
+    scaler = StandardScaler()
+    scaler.fit(reduced_data.values)
+    data_scaled = scaler.transform(reduced_data.values)
+
+    clusterer = HDBSCAN_flat(
+        data_scaled,
+        cluster_selection_method="eom",
+        n_clusters=n_clusters,
+        min_cluster_size=math.ceil(0.02 * data_scaled.shape[0]),
+    )
+    clusters = clusterer.labels_
+
+    cluster_data = xarray.DataArray(clusters, dims=("i",), coords={"i": image.data_flat.coords["i"]}).expand_dims("c")
+    cluster_data.coords["c"] = ["cluster"]
+    cluster_data.attrs["bg_value"] = np.nan
+    cluster_image = MultiChannelImage(cluster_data.unstack("i"))
+    cluster_image.write_hdf5(output_netcdf_path)
+
+
+if __name__ == "__main__":
+    typer.run(cluster_hdbscan)
diff --git a/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py b/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py
index efd20d9..5070eaa 100644
--- a/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py
+++ b/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py
@@ -5,7 +5,8 @@
 import typer
 import xarray
 from loguru import logger
-from sklearn.cluster import KMeans
+from sklearn.cluster import KMeans, BisectingKMeans
+from sklearn.preprocessing import StandardScaler
 from typer import Option
 
 from depiction.image.multi_channel_image import MultiChannelImage
@@ -31,8 +32,11 @@ def cluster_kmeans(input_netcdf_path: Annotated[Path, Option()], output_netcdf_p
 
     reduced_data = retain_strongest_signals(image.data_flat.transpose("i", "c"), n_features)
 
-    kmeans = KMeans(n_clusters=n_clusters)
-    clusters = kmeans.fit_predict(reduced_data.values)
+    scaler = StandardScaler()
+    scaler.fit(reduced_data.values)
+
+    kmeans = BisectingKMeans(n_clusters=n_clusters)
+    clusters = kmeans.fit_predict(scaler.transform(reduced_data.values))
 
     cluster_data = xarray.DataArray(clusters, dims=("i",), coords={"i": image.data_flat.coords["i"]}).expand_dims("c")
     cluster_data.coords["c"] = ["cluster"]
diff --git a/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py b/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
index f314af9..bfcb155 100644
--- a/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
+++ b/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
@@ -5,12 +5,28 @@
 import polars as pl
 import typer
 from loguru import logger
+from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
 from typer import Option
 
 from depiction.image.multi_channel_image import MultiChannelImage
 from depiction_targeted_preproc.workflow.proc.__cluster_stats import compute_CHAOS, compute_PAS
 
 
+def compute_silhouette(cluster_data, cluster_coords):
+    # higher is better
+    return silhouette_score(cluster_coords, cluster_data)
+
+
+def compute_davies_bouldin(cluster_data, cluster_coords):
+    # lower is better
+    return davies_bouldin_score(cluster_coords, cluster_data)
+
+
+def compute_calinski_harabasz(cluster_data, cluster_coords):
+    # higher is better
+    return calinski_harabasz_score(cluster_coords, cluster_data)
+
+
 def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path: Annotated[Path, Option()]) -> None:
     cluster_image = MultiChannelImage.read_hdf5(input_netcdf_path)
 
@@ -28,7 +44,21 @@ def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path:
     pas = compute_PAS(cluster_data, cluster_coords)
     logger.info(f"Computed PAS: {pas}")
 
-    metrics_df = pl.DataFrame({"metric": ["CHAOS", "PAS"], "value": [chaos, pas]})
+    silhouette = compute_silhouette(cluster_data, cluster_coords)
+    logger.info(f"Computed silhouette: {silhouette}")
+
+    davies_bouldin = compute_davies_bouldin(cluster_data, cluster_coords)
+    logger.info(f"Computed Davies-Bouldin: {davies_bouldin}")
+
+    calinski_harabasz = compute_calinski_harabasz(cluster_data, cluster_coords)
+    logger.info(f"Computed Calinski-Harabasz: {calinski_harabasz}")
+
+    metrics_df = pl.DataFrame(
+        {
+            "metric": ["CHAOS", "PAS", "Silhouette", "Davies-Bouldin", "Calinski-Harabasz"],
+            "value": [chaos, pas, silhouette, davies_bouldin, calinski_harabasz],
+        }
+    )
 
     metrics_df.write_csv(output_csv_path)
 
diff --git a/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py b/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py
index 82b27fb..994a51d 100644
--- a/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py
+++ b/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py
@@ -42,7 +42,7 @@ def plot_density_combined_full(df_peak_dist: pl.DataFrame, out_pdf: Path) -> Non
     collect = []
     for variant in variants:
         df_variant = df_peak_dist.filter(variant=variant)
-        dist, density = FFTKDE(bw="ISJ").fit(df_variant["dist"].to_numpy()).evaluate(2 ** 10)
+        dist, density = FFTKDE(bw="ISJ").fit(df_variant["dist"].to_numpy()).evaluate(2**10)
         collect.append(pl.DataFrame({"dist": list(dist), "density": list(density), "variant": variant}))
 
     dist_min = df_peak_dist["dist"].min()
@@ -50,17 +50,12 @@ def plot_density_combined_full(df_peak_dist: pl.DataFrame, out_pdf: Path) -> Non
     df_density = pl.concat(collect).filter((pl.col("dist") >= dist_min) & (pl.col("dist") <= dist_max))
 
     chart = (
-        (
-            alt.Chart(df_density)
-            .mark_line()
-            .encode(x="dist:Q", y="density:Q", color="variant:N")
-            .properties(width=500, height=300)
-        )
-        .properties(title="Linear scale")
-    )
-    chart = chart.properties(
-        title=f"Density of target-surrounding peak distances (N={len(df_peak_dist):,})"
-    )
+        alt.Chart(df_density)
+        .mark_line()
+        .encode(x="dist:Q", y="density:Q", color="variant:N")
+        .properties(width=500, height=300)
+    ).properties(title="Linear scale")
+    chart = chart.properties(title=f"Density of target-surrounding peak distances (N={len(df_peak_dist):,})")
 
     chart.save(out_pdf)
 
diff --git a/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk b/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk
index ce820b2..6bdfaa9 100644
--- a/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk
+++ b/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk
@@ -68,11 +68,20 @@ rule proc_cluster_kmeans:
         "python -m depiction_targeted_preproc.workflow.proc.cluster_kmeans "
         " --input-netcdf-path {input.netcdf} --output-netcdf-path {output.netcdf}"
 
-rule proc_cluster_stats_kmeans:
+rule proc_cluster_hdbscan:
     input:
-        netcdf="{sample}/cluster_default_kmeans.hdf5"
+        netcdf="{sample}/images_default.hdf5"
+    output:
+        netcdf="{sample}/cluster_default_hdbscan.hdf5"
+    shell:
+        "python -m depiction_targeted_preproc.workflow.proc.cluster_hdbscan "
+        " --input-netcdf-path {input.netcdf} --output-netcdf-path {output.netcdf}"
+
+rule proc_cluster_stats:
+    input:
+        netcdf="{sample}/cluster_default_{variant}.hdf5"
     output:
-        csv="{sample}/cluster_default_stats_kmeans.csv"
+        csv="{sample}/cluster_default_stats_{variant}.csv"
     shell:
         "python -m depiction_targeted_preproc.workflow.proc.cluster_stats"
         " --input-netcdf-path {input.netcdf} --output-csv-path {output.csv}"
diff --git a/src/depiction_targeted_preproc/workflow/vis/clustering.py b/src/depiction_targeted_preproc/workflow/vis/clustering.py
index 43a6580..0937f92 100644
--- a/src/depiction_targeted_preproc/workflow/vis/clustering.py
+++ b/src/depiction_targeted_preproc/workflow/vis/clustering.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Annotated
 
+import numpy as np
 import typer
 import xarray
 from matplotlib import pyplot as plt
@@ -14,6 +15,8 @@ def vis_clustering(input_netcdf_path: Annotated[Path, Option()], output_png_path
     source_image.plot(cmap="tab10", ax=ax)
     ax.set_aspect("equal")
 
+    n_classes = len(np.unique(source_image.values))
+    fig.suptitle(f"Clustering with {n_classes} classes")
+
     fig.tight_layout()
     fig.savefig(output_png_path)
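
Note on the new clustering step: hdbscan.flat.HDBSCAN_flat fits a regular HDBSCAN clusterer and then re-cuts its condensed tree so that the flat labelling contains (approximately) the requested number of clusters. The sketch below runs the same pipeline as cluster_hdbscan.py on synthetic data; it assumes the hdbscan and scikit-learn packages are installed, and make_blobs plus all parameter values are illustrative stand-ins rather than part of the patch.

    import math

    import numpy as np
    from hdbscan.flat import HDBSCAN_flat
    from sklearn.datasets import make_blobs
    from sklearn.preprocessing import StandardScaler

    # Stand-in for the (n_pixels, n_features) matrix of strongest signals.
    data, _ = make_blobs(n_samples=2000, centers=10, n_features=50, random_state=0)

    # Same preprocessing as the patch: zero mean, unit variance per feature.
    data_scaled = StandardScaler().fit_transform(data)

    # Fit HDBSCAN and re-cut the condensed tree into a flat labelling with
    # (approximately) n_clusters clusters; extra kwargs go to HDBSCAN itself.
    clusterer = HDBSCAN_flat(
        data_scaled,
        cluster_selection_method="eom",
        n_clusters=10,
        min_cluster_size=math.ceil(0.02 * data_scaled.shape[0]),
    )
    labels = clusterer.labels_

    # Unlike KMeans, HDBSCAN marks unassignable points as noise (label -1).
    print("clusters:", len(np.unique(labels[labels >= 0])))
    print("noise fraction:", float(np.mean(labels == -1)))

One consequence worth noting for the stats step: because HDBSCAN can emit the noise label -1, the metrics added in cluster_stats.py (silhouette, Davies-Bouldin, Calinski-Harabasz) will treat noise pixels as one extra cluster unless they are filtered out beforehand.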