From 1024e79fbdeae3250255e270c6df982488e4c882 Mon Sep 17 00:00:00 2001
From: Leonardo Schwarz
Date: Fri, 7 Jun 2024 14:51:48 +0200
Subject: [PATCH] also add hdbscan as an alternative approach

---
 pyproject.toml                                |  3 +-
 src/depiction_targeted_preproc/example/run.py |  1 +
 .../example_compare/run_compare.py            |  3 +-
 .../workflow/exp/plot_compare_peak_density.py |  7 ++-
 .../workflow/experimental.smk                 |  2 +-
 .../workflow/proc/cluster_hdbscan.py          | 44 ++++++++++++++++++
 .../workflow/proc/cluster_kmeans.py           | 10 ++--
 .../workflow/proc/cluster_stats.py            | 32 +++++++++++++-
 .../workflow/qc/plot_peak_density.py          | 19 +++----
 .../workflow/rules/rules_proc.smk             | 15 ++++--
 .../workflow/vis/clustering.py                |  3 ++
 11 files changed, 114 insertions(+), 25 deletions(-)
 create mode 100644 src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py

diff --git a/pyproject.toml b/pyproject.toml
index 28ae232..9e5d96f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,8 @@ dev = [
     "tifffile>=2024.2.12",
     "xmltodict",
     "licensecheck",
-    "ms-peak-picker"
+    "ms-peak-picker",
+    "hdbscan"
 ]
 
 # potentially relevant in the future again:
diff --git a/src/depiction_targeted_preproc/example/run.py b/src/depiction_targeted_preproc/example/run.py
index ddcc114..64e0882 100644
--- a/src/depiction_targeted_preproc/example/run.py
+++ b/src/depiction_targeted_preproc/example/run.py
@@ -31,6 +31,7 @@
         "cluster_default_kmeans.hdf5",
         "cluster_default_stats_kmeans.csv",
         "cluster_default_kmeans.png",
+        "cluster_default_hdbscan.png",
     ],
 }
 
diff --git a/src/depiction_targeted_preproc/example_compare/run_compare.py b/src/depiction_targeted_preproc/example_compare/run_compare.py
index 32f18b1..11a8748 100644
--- a/src/depiction_targeted_preproc/example_compare/run_compare.py
+++ b/src/depiction_targeted_preproc/example_compare/run_compare.py
@@ -17,8 +17,7 @@ def prepare_tasks(input_imzml_path: Path, work_dir: Path) -> list[Path]:
     requested_files = get_all_output_files(folders)
 
     combined_dir = work_dir / input_imzml_path.stem
-    exp_files = [combined_dir / "exp_compare_cluster_stats.pdf",
-                 combined_dir / "exp_plot_compare_peak_density.pdf"]
+    exp_files = [combined_dir / "exp_compare_cluster_stats.pdf", combined_dir / "exp_plot_compare_peak_density.pdf"]
     return requested_files + exp_files
 
 
diff --git a/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py b/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py
index 6f0a436..5f3b665 100644
--- a/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py
+++ b/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py
@@ -17,8 +17,11 @@ def exp_plot_compare_peak_density(
     vegafusion.enable()
 
     table = pl.concat(
-        [pl.read_parquet(path).with_columns(variant=pl.lit(path.parents[1].name)) for path in
-         tables_marker_distances_calib])
+        [
+            pl.read_parquet(path).with_columns(variant=pl.lit(path.parents[1].name))
+            for path in tables_marker_distances_calib
+        ]
+    )
     table = pl.concat([table, pl.read_parquet(table_marker_distance_uncalib).with_columns(variant=pl.lit("uncalib"))])
 
     plot_density_combined_full(df_peak_dist=table, out_pdf=output_pdf)
diff --git a/src/depiction_targeted_preproc/workflow/experimental.smk b/src/depiction_targeted_preproc/workflow/experimental.smk
index 1980553..976cc06 100644
--- a/src/depiction_targeted_preproc/workflow/experimental.smk
+++ b/src/depiction_targeted_preproc/workflow/experimental.smk
@@ -9,7 +9,7 @@ exp_variants = ["chem_noise", "mass_cluster", "reg_shift"]
 
 rule exp_compare_cluster_stats:
     input:
-        csv=expand("{{sample}}/{exp_variant}/cluster_default_stats_kmeans.csv", exp_variant=exp_variants)
+        csv=expand("{{sample}}/{exp_variant}/cluster_default_stats_hdbscan.csv", exp_variant=exp_variants)
     output:
         pdf="{sample}/exp_compare_cluster_stats.pdf"
     shell:
diff --git a/src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py b/src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py
new file mode 100644
index 0000000..343edc7
--- /dev/null
+++ b/src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+from typing import Annotated
+
+import math
+import numpy as np
+import typer
+import xarray
+from hdbscan.flat import HDBSCAN_flat
+from sklearn.preprocessing import StandardScaler
+from typer import Option
+
+from depiction.image.multi_channel_image import MultiChannelImage
+from depiction_targeted_preproc.workflow.proc.cluster_kmeans import retain_strongest_signals
+
+
+def cluster_hdbscan(input_netcdf_path: Annotated[Path, Option()], output_netcdf_path: Annotated[Path, Option()]) -> None:
+    image = MultiChannelImage.read_hdf5(input_netcdf_path)
+    # TODO make configurable
+    n_clusters = 10
+    n_features = 50
+
+    reduced_data = retain_strongest_signals(image.data_flat.transpose("i", "c"), n_features)
+
+    scaler = StandardScaler()
+    scaler.fit(reduced_data.values)
+    data_scaled = scaler.transform(reduced_data.values)
+
+    clusterer = HDBSCAN_flat(
+        data_scaled,
+        cluster_selection_method="eom",
+        n_clusters=n_clusters,
+        min_cluster_size=math.ceil(0.02 * data_scaled.shape[0]),
+    )
+    clusters = clusterer.labels_
+
+    cluster_data = xarray.DataArray(clusters, dims=("i",), coords={"i": image.data_flat.coords["i"]}).expand_dims("c")
+    cluster_data.coords["c"] = ["cluster"]
+    cluster_data.attrs["bg_value"] = np.nan
+    cluster_image = MultiChannelImage(cluster_data.unstack("i"))
+    cluster_image.write_hdf5(output_netcdf_path)
+
+
+if __name__ == "__main__":
+    typer.run(cluster_hdbscan)
diff --git a/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py b/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py
index efd20d9..5070eaa 100644
--- a/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py
+++ b/src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py
@@ -5,7 +5,8 @@
 import typer
 import xarray
 from loguru import logger
-from sklearn.cluster import KMeans
+from sklearn.cluster import KMeans, BisectingKMeans
+from sklearn.preprocessing import StandardScaler
 from typer import Option
 
 from depiction.image.multi_channel_image import MultiChannelImage
@@ -31,8 +32,11 @@ def cluster_kmeans(input_netcdf_path: Annotated[Path, Option()], output_netcdf_p
 
     reduced_data = retain_strongest_signals(image.data_flat.transpose("i", "c"), n_features)
 
-    kmeans = KMeans(n_clusters=n_clusters)
-    clusters = kmeans.fit_predict(reduced_data.values)
+    scaler = StandardScaler()
+    scaler.fit(reduced_data.values)
+
+    kmeans = BisectingKMeans(n_clusters=n_clusters)
+    clusters = kmeans.fit_predict(scaler.transform(reduced_data.values))
 
     cluster_data = xarray.DataArray(clusters, dims=("i",), coords={"i": image.data_flat.coords["i"]}).expand_dims("c")
     cluster_data.coords["c"] = ["cluster"]
diff --git a/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py b/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
index f314af9..bfcb155 100644
--- a/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
+++ b/src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
@@ -5,12 +5,28 @@
 import polars as pl
 import typer
 from loguru import logger
+from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
 from typer import Option
 
 from depiction.image.multi_channel_image import MultiChannelImage
 from depiction_targeted_preproc.workflow.proc.__cluster_stats import compute_CHAOS, compute_PAS
 
 
+def compute_silhouette(cluster_data, cluster_coords):
+    # higher is better
+    return silhouette_score(cluster_coords, cluster_data)
+
+
+def compute_davies_bouldin(cluster_data, cluster_coords):
+    # lower is better
+    return davies_bouldin_score(cluster_coords, cluster_data)
+
+
+def compute_calinski_harabasz(cluster_data, cluster_coords):
+    # higher is better
+    return calinski_harabasz_score(cluster_coords, cluster_data)
+
+
 def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path: Annotated[Path, Option()]) -> None:
     cluster_image = MultiChannelImage.read_hdf5(input_netcdf_path)
 
@@ -28,7 +44,21 @@ def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path:
     pas = compute_PAS(cluster_data, cluster_coords)
     logger.info(f"Computed PAS: {pas}")
 
-    metrics_df = pl.DataFrame({"metric": ["CHAOS", "PAS"], "value": [chaos, pas]})
+    silhouette = compute_silhouette(cluster_data, cluster_coords)
+    logger.info(f"Computed silhouette: {silhouette}")
+
+    davies_bouldin = compute_davies_bouldin(cluster_data, cluster_coords)
+    logger.info(f"Computed Davies-Bouldin: {davies_bouldin}")
+
+    calinski_harabasz = compute_calinski_harabasz(cluster_data, cluster_coords)
+    logger.info(f"Computed Calinski-Harabasz: {calinski_harabasz}")
+
+    metrics_df = pl.DataFrame(
+        {
+            "metric": ["CHAOS", "PAS", "Silhouette", "Davies-Bouldin", "Calinski-Harabasz"],
+            "value": [chaos, pas, silhouette, davies_bouldin, calinski_harabasz],
+        }
+    )
 
     metrics_df.write_csv(output_csv_path)
 
diff --git a/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py b/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py
index 82b27fb..994a51d 100644
--- a/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py
+++ b/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py
@@ -42,7 +42,7 @@ def plot_density_combined_full(df_peak_dist: pl.DataFrame, out_pdf: Path) -> Non
     collect = []
     for variant in variants:
         df_variant = df_peak_dist.filter(variant=variant)
-        dist, density = FFTKDE(bw="ISJ").fit(df_variant["dist"].to_numpy()).evaluate(2 ** 10)
+        dist, density = FFTKDE(bw="ISJ").fit(df_variant["dist"].to_numpy()).evaluate(2**10)
         collect.append(pl.DataFrame({"dist": list(dist), "density": list(density), "variant": variant}))
 
     dist_min = df_peak_dist["dist"].min()
@@ -50,17 +50,12 @@ def plot_density_combined_full(df_peak_dist: pl.DataFrame, out_pdf: Path) -> Non
     df_density = pl.concat(collect).filter((pl.col("dist") >= dist_min) & (pl.col("dist") <= dist_max))
 
     chart = (
-        (
-            alt.Chart(df_density)
-            .mark_line()
-            .encode(x="dist:Q", y="density:Q", color="variant:N")
-            .properties(width=500, height=300)
-        )
-        .properties(title="Linear scale")
-    )
-    chart = chart.properties(
-        title=f"Density of target-surrounding peak distances (N={len(df_peak_dist):,})"
-    )
+        alt.Chart(df_density)
+        .mark_line()
+        .encode(x="dist:Q", y="density:Q", color="variant:N")
+        .properties(width=500, height=300)
+    ).properties(title="Linear scale")
+    chart = chart.properties(title=f"Density of target-surrounding peak distances (N={len(df_peak_dist):,})")
 
     chart.save(out_pdf)
 
diff --git a/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk b/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk
index ce820b2..6bdfaa9 100644
--- a/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk
+++ b/src/depiction_targeted_preproc/workflow/rules/rules_proc.smk
@@ -68,11 +68,20 @@ rule proc_cluster_kmeans:
         "python -m depiction_targeted_preproc.workflow.proc.cluster_kmeans "
         " --input-netcdf-path {input.netcdf} --output-netcdf-path {output.netcdf}"
 
-rule proc_cluster_stats_kmeans:
+rule proc_cluster_hdbscan:
     input:
-        netcdf="{sample}/cluster_default_kmeans.hdf5"
+        netcdf="{sample}/images_default.hdf5"
+    output:
+        netcdf="{sample}/cluster_default_hdbscan.hdf5"
+    shell:
+        "python -m depiction_targeted_preproc.workflow.proc.cluster_hdbscan "
+        " --input-netcdf-path {input.netcdf} --output-netcdf-path {output.netcdf}"
+
+rule proc_cluster_stats:
+    input:
+        netcdf="{sample}/cluster_default_{variant}.hdf5"
     output:
-        csv="{sample}/cluster_default_stats_kmeans.csv"
+        csv="{sample}/cluster_default_stats_{variant}.csv"
     shell:
         "python -m depiction_targeted_preproc.workflow.proc.cluster_stats"
         " --input-netcdf-path {input.netcdf} --output-csv-path {output.csv}"
diff --git a/src/depiction_targeted_preproc/workflow/vis/clustering.py b/src/depiction_targeted_preproc/workflow/vis/clustering.py
index 43a6580..0937f92 100644
--- a/src/depiction_targeted_preproc/workflow/vis/clustering.py
+++ b/src/depiction_targeted_preproc/workflow/vis/clustering.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Annotated
 
+import numpy as np
 import typer
 import xarray
 from matplotlib import pyplot as plt
@@ -14,6 +15,8 @@ def vis_clustering(input_netcdf_path: Annotated[Path, Option()], output_png_path
     source_image.plot(cmap="tab10", ax=ax)
     ax.set_aspect("equal")
 
+    n_classes = len(np.unique(source_image.values))
+    fig.suptitle(f"Clustering with {n_classes} classes")
+
     fig.tight_layout()
     fig.savefig(output_png_path)
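
Note on the new clustering step: hdbscan.flat.HDBSCAN_flat fits a regular HDBSCAN clusterer and then re-cuts its condensed tree so that the flat labelling contains (approximately) the requested number of clusters. The sketch below runs the same pipeline as cluster_hdbscan.py on synthetic data; it assumes the hdbscan and scikit-learn packages are installed, and make_blobs plus all parameter values are illustrative stand-ins rather than part of the patch.

    import math

    import numpy as np
    from hdbscan.flat import HDBSCAN_flat
    from sklearn.datasets import make_blobs
    from sklearn.preprocessing import StandardScaler

    # Stand-in for the (n_pixels, n_features) matrix of strongest signals.
    data, _ = make_blobs(n_samples=2000, centers=10, n_features=50, random_state=0)

    # Same preprocessing as the patch: zero mean, unit variance per feature.
    data_scaled = StandardScaler().fit_transform(data)

    # Fit HDBSCAN and re-cut the condensed tree into a flat labelling with
    # (approximately) n_clusters clusters; extra kwargs go to HDBSCAN itself.
    clusterer = HDBSCAN_flat(
        data_scaled,
        cluster_selection_method="eom",
        n_clusters=10,
        min_cluster_size=math.ceil(0.02 * data_scaled.shape[0]),
    )
    labels = clusterer.labels_

    # Unlike KMeans, HDBSCAN marks unassignable points as noise (label -1).
    print("clusters:", len(np.unique(labels[labels >= 0])))
    print("noise fraction:", float(np.mean(labels == -1)))

One consequence worth noting for the stats step: because HDBSCAN can emit the noise label -1, the metrics added in cluster_stats.py (silhouette, Davies-Bouldin, Calinski-Harabasz) will treat noise pixels as one extra cluster unless they are filtered out beforehand.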