also add hdbscan as an alternative approach

leoschwarz committed Jun 7, 2024
1 parent de1ba04 commit 1024e79
Showing 11 changed files with 121 additions and 25 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -61,7 +61,8 @@ dev = [
     "tifffile>=2024.2.12",
     "xmltodict",
     "licensecheck",
-    "ms-peak-picker"
+    "ms-peak-picker",
+    "hdbscan"
 ]

 # potentially relevant in the future again:
1 change: 1 addition & 0 deletions src/depiction_targeted_preproc/example/run.py
@@ -31,6 +31,7 @@
         "cluster_default_kmeans.hdf5",
         "cluster_default_stats_kmeans.csv",
         "cluster_default_kmeans.png",
+        "cluster_default_hdbscan.png",
     ],
 }

@@ -17,8 +17,7 @@ def prepare_tasks(input_imzml_path: Path, work_dir: Path) -> list[Path]:
     requested_files = get_all_output_files(folders)

     combined_dir = work_dir / input_imzml_path.stem
-    exp_files = [combined_dir / "exp_compare_cluster_stats.pdf",
-                 combined_dir / "exp_plot_compare_peak_density.pdf"]
+    exp_files = [combined_dir / "exp_compare_cluster_stats.pdf", combined_dir / "exp_plot_compare_peak_density.pdf"]
     return requested_files + exp_files

@@ -17,8 +17,11 @@ def exp_plot_compare_peak_density(
     vegafusion.enable()

     table = pl.concat(
-        [pl.read_parquet(path).with_columns(variant=pl.lit(path.parents[1].name)) for path in
-         tables_marker_distances_calib])
+        [
+            pl.read_parquet(path).with_columns(variant=pl.lit(path.parents[1].name))
+            for path in tables_marker_distances_calib
+        ]
+    )
     table = pl.concat([table, pl.read_parquet(table_marker_distance_uncalib).with_columns(variant=pl.lit("uncalib"))])

     plot_density_combined_full(df_peak_dist=table, out_pdf=output_pdf)
2 changes: 1 addition & 1 deletion src/depiction_targeted_preproc/workflow/experimental.smk
@@ -9,7 +9,7 @@ exp_variants = ["chem_noise", "mass_cluster", "reg_shift"]

 rule exp_compare_cluster_stats:
     input:
-        csv=expand("{{sample}}/{exp_variant}/cluster_default_stats_kmeans.csv", exp_variant=exp_variants)
+        csv=expand("{{sample}}/{exp_variant}/cluster_default_stats_hdbscan.csv", exp_variant=exp_variants)
     output:
         pdf="{sample}/exp_compare_cluster_stats.pdf"
     shell:
49 changes: 49 additions & 0 deletions src/depiction_targeted_preproc/workflow/proc/cluster_hdbscan.py
@@ -0,0 +1,49 @@
+import math
+from pathlib import Path
+from typing import Annotated
+
+import numpy as np
+import typer
+import xarray
+from hdbscan.flat import HDBSCAN_flat
+from sklearn.preprocessing import StandardScaler
+from typer import Option
+
+from depiction.image.multi_channel_image import MultiChannelImage
+from depiction_targeted_preproc.workflow.proc.cluster_kmeans import retain_strongest_signals
+
+
+def cluster_hdbscan(input_netcdf_path: Annotated[Path, Option()], output_netcdf_path: Annotated[Path, Option()]) -> None:
+    image = MultiChannelImage.read_hdf5(input_netcdf_path)
+    # TODO make configurable
+    n_clusters = 10
+    n_features = 50
+
+    # keep only the strongest channels, then standardize the features before clustering
+    reduced_data = retain_strongest_signals(image.data_flat.transpose("i", "c"), n_features)
+    scaler = StandardScaler()
+    data_scaled = scaler.fit_transform(reduced_data.values)
+
+    # extract a flat clustering with the requested number of clusters from the HDBSCAN
+    # hierarchy; min_cluster_size is set to 2% of the pixels to suppress tiny clusters
+    clusterer = HDBSCAN_flat(
+        data_scaled,
+        cluster_selection_method="eom",
+        n_clusters=n_clusters,
+        min_cluster_size=math.ceil(0.02 * data_scaled.shape[0]),
+    )
+    clusters = clusterer.labels_
+
+    cluster_data = xarray.DataArray(clusters, dims=("i",), coords={"i": image.data_flat.coords["i"]}).expand_dims("c")
+    cluster_data.coords["c"] = ["cluster"]
+    cluster_data.attrs["bg_value"] = np.nan
+    cluster_image = MultiChannelImage(cluster_data.unstack("i"))
+    cluster_image.write_hdf5(output_netcdf_path)
+
+
+if __name__ == "__main__":
+    typer.run(cluster_hdbscan)
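
Note: unlike plain hdbscan.HDBSCAN, which picks the number of clusters itself and labels outliers -1, HDBSCAN_flat cuts the cluster hierarchy to an approximate target count. A minimal sketch of the difference on synthetic data (hdbscan and scikit-learn assumed installed; data and parameters are illustrative):

import hdbscan
from hdbscan.flat import HDBSCAN_flat
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=1000, centers=6, random_state=0)

# plain HDBSCAN: the cluster count is data-driven, noise points get label -1
labels_auto = hdbscan.HDBSCAN(min_cluster_size=20).fit_predict(X)
print("auto:", sorted(set(labels_auto)))

# HDBSCAN_flat: request a target number of clusters, as in cluster_hdbscan above
clusterer = HDBSCAN_flat(X, cluster_selection_method="eom", n_clusters=4, min_cluster_size=20)
print("flat:", sorted(set(clusterer.labels_)))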
10 changes: 7 additions & 3 deletions src/depiction_targeted_preproc/workflow/proc/cluster_kmeans.py
@@ -5,7 +5,8 @@
 import typer
 import xarray
 from loguru import logger
-from sklearn.cluster import KMeans
+from sklearn.cluster import KMeans, BisectingKMeans
+from sklearn.preprocessing import StandardScaler
 from typer import Option

 from depiction.image.multi_channel_image import MultiChannelImage
@@ -31,8 +32,11 @@ def cluster_kmeans(input_netcdf_path: Annotated[Path, Option()], output_netcdf_p

     reduced_data = retain_strongest_signals(image.data_flat.transpose("i", "c"), n_features)

-    kmeans = KMeans(n_clusters=n_clusters)
-    clusters = kmeans.fit_predict(reduced_data.values)
+    scaler = StandardScaler()
+    scaler.fit(reduced_data.values)
+
+    kmeans = BisectingKMeans(n_clusters=n_clusters)
+    clusters = kmeans.fit_predict(scaler.transform(reduced_data.values))

     cluster_data = xarray.DataArray(clusters, dims=("i",), coords={"i": image.data_flat.coords["i"]}).expand_dims("c")
     cluster_data.coords["c"] = ["cluster"]
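
Note: the scaling step matters because k-means variants minimise Euclidean distances, so unscaled channels with large intensity ranges would otherwise dominate; BisectingKMeans (scikit-learn >= 1.1) builds the partition by recursive splitting rather than one global assignment. A minimal sketch of the same pattern on synthetic data (shapes and parameters are illustrative):

import numpy as np
from sklearn.cluster import BisectingKMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 50)) * rng.uniform(1, 100, size=50)  # features on wildly different scales

X_scaled = StandardScaler().fit_transform(X)  # zero mean, unit variance per feature
labels = BisectingKMeans(n_clusters=10, random_state=0).fit_predict(X_scaled)
print(np.bincount(labels))  # cluster sizes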
33 changes: 32 additions & 1 deletion src/depiction_targeted_preproc/workflow/proc/cluster_stats.py
@@ -5,12 +5,31 @@
 import polars as pl
 import typer
 from loguru import logger
+from sklearn.metrics import calinski_harabasz_score
+from sklearn.metrics import silhouette_score, davies_bouldin_score
 from typer import Option

 from depiction.image.multi_channel_image import MultiChannelImage
 from depiction_targeted_preproc.workflow.proc.__cluster_stats import compute_CHAOS, compute_PAS


+# from fastdtw import fastdtw
+
+
+def compute_silhouette(cluster_data, cluster_coords):
+    # higher is better
+    return silhouette_score(cluster_coords, cluster_data)
+
+
+def compute_davies_bouldin(cluster_data, cluster_coords):
+    # lower is better (higher is worse)
+    return davies_bouldin_score(cluster_coords, cluster_data)
+
+
+def compute_calinski_harabasz(X, labels):
+    # higher is better
+    return calinski_harabasz_score(X, labels)
+
+
 def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path: Annotated[Path, Option()]) -> None:
     cluster_image = MultiChannelImage.read_hdf5(input_netcdf_path)

@@ -28,7 +47,19 @@ def cluster_stats(input_netcdf_path: Annotated[Path, Option()], output_csv_path:
     pas = compute_PAS(cluster_data, cluster_coords)
     logger.info(f"Computed PAS: {pas}")

-    metrics_df = pl.DataFrame({"metric": ["CHAOS", "PAS"], "value": [chaos, pas]})
+    silhouette = compute_silhouette(cluster_data, cluster_coords)
+    logger.info(f"Computed silhouette: {silhouette}")
+
+    davies_bouldin = compute_davies_bouldin(cluster_data, cluster_coords)
+    logger.info(f"Computed Davies-Bouldin: {davies_bouldin}")
+
+    calinski_harabasz = compute_calinski_harabasz(cluster_coords, cluster_data)
+    logger.info(f"Computed Calinski-Harabasz: {calinski_harabasz}")
+
+    metrics_df = pl.DataFrame(
+        {
+            "metric": ["CHAOS", "PAS", "Silhouette", "Davies-Bouldin", "Calinski-Harabasz"],
+            "value": [chaos, pas, silhouette, davies_bouldin, calinski_harabasz],
+        }
+    )
     metrics_df.write_csv(output_csv_path)

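
Note: all three added scores use the scikit-learn signature score(X, labels) and are computed here on the spatial coordinates; silhouette and Calinski-Harabasz are higher-is-better, Davies-Bouldin is lower-is-better. One caveat: an HDBSCAN noise label of -1 would be treated as just another cluster by these metrics. A minimal sketch with synthetic labels:

import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

rng = np.random.default_rng(0)
coords = np.vstack([rng.normal(loc=(0, 0), size=(100, 2)), rng.normal(loc=(5, 5), size=(100, 2))])
labels = np.repeat([0, 1], 100)

print(silhouette_score(coords, labels))         # in [-1, 1], higher is better
print(davies_bouldin_score(coords, labels))     # >= 0, lower is better
print(calinski_harabasz_score(coords, labels))  # >= 0, higher is better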
19 changes: 7 additions & 12 deletions src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py
@@ -42,25 +42,20 @@ def plot_density_combined_full(df_peak_dist: pl.DataFrame, out_pdf: Path) -> Non
     collect = []
     for variant in variants:
         df_variant = df_peak_dist.filter(variant=variant)
-        dist, density = FFTKDE(bw="ISJ").fit(df_variant["dist"].to_numpy()).evaluate(2 ** 10)
+        dist, density = FFTKDE(bw="ISJ").fit(df_variant["dist"].to_numpy()).evaluate(2**10)
         collect.append(pl.DataFrame({"dist": list(dist), "density": list(density), "variant": variant}))

     dist_min = df_peak_dist["dist"].min()
     dist_max = df_peak_dist["dist"].max()
     df_density = pl.concat(collect).filter((pl.col("dist") >= dist_min) & (pl.col("dist") <= dist_max))

     chart = (
-        (
-            alt.Chart(df_density)
-            .mark_line()
-            .encode(x="dist:Q", y="density:Q", color="variant:N")
-            .properties(width=500, height=300)
-        )
-        .properties(title="Linear scale")
-    )
-    chart = chart.properties(
-        title=f"Density of target-surrounding peak distances (N={len(df_peak_dist):,})"
-    )
+        alt.Chart(df_density)
+        .mark_line()
+        .encode(x="dist:Q", y="density:Q", color="variant:N")
+        .properties(width=500, height=300)
+    ).properties(title="Linear scale")
+    chart = chart.properties(title=f"Density of target-surrounding peak distances (N={len(df_peak_dist):,})")
     chart.save(out_pdf)

15 changes: 12 additions & 3 deletions src/depiction_targeted_preproc/workflow/rules/rules_proc.smk
@@ -68,11 +68,20 @@ rule proc_cluster_kmeans:
         "python -m depiction_targeted_preproc.workflow.proc.cluster_kmeans "
         " --input-netcdf-path {input.netcdf} --output-netcdf-path {output.netcdf}"

-rule proc_cluster_stats_kmeans:
+rule proc_cluster_hdbscan:
     input:
-        netcdf="{sample}/cluster_default_kmeans.hdf5"
+        netcdf="{sample}/images_default.hdf5"
     output:
-        csv="{sample}/cluster_default_stats_kmeans.csv"
+        netcdf="{sample}/cluster_default_hdbscan.hdf5"
+    shell:
+        "python -m depiction_targeted_preproc.workflow.proc.cluster_hdbscan "
+        " --input-netcdf-path {input.netcdf} --output-netcdf-path {output.netcdf}"
+
+rule proc_cluster_stats:
+    input:
+        netcdf="{sample}/cluster_default_{variant}.hdf5"
+    output:
+        csv="{sample}/cluster_default_stats_{variant}.csv"
     shell:
         "python -m depiction_targeted_preproc.workflow.proc.cluster_stats"
         " --input-netcdf-path {input.netcdf} --output-csv-path {output.csv}"
4 changes: 4 additions & 0 deletions src/depiction_targeted_preproc/workflow/vis/clustering.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Annotated

+import numpy as np
 import typer
 import xarray
 from matplotlib import pyplot as plt
@@ -14,6 +15,9 @@ def vis_clustering(input_netcdf_path: Annotated[Path, Option()], output_png_path
     source_image.plot(cmap="tab10", ax=ax)
     ax.set_aspect("equal")

+    # n_classes = len(set(source_image.values.ravel()))
+    n_classes = len(np.unique(source_image.values))
+    fig.suptitle(f"Clustering with {n_classes} classes")
     fig.tight_layout()
     fig.savefig(output_png_path)

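
Note: the clustering images store their background as NaN (bg_value in cluster_hdbscan.py above), and np.unique includes NaN values in its result, so the reported class count can be inflated by the background. A sketch of a background-aware count, assuming the same NaN convention applies to the loaded values:

values = source_image.values
n_classes = len(np.unique(values[~np.isnan(values)]))  # count only real cluster labels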
