diff --git a/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py b/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py index aaba086..6f0a436 100644 --- a/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py +++ b/src/depiction_targeted_preproc/workflow/exp/plot_compare_peak_density.py @@ -6,11 +6,12 @@ import vegafusion from typer import Option, Argument -from depiction_targeted_preproc.workflow.qc.plot_peak_density import plot_density_combined +from depiction_targeted_preproc.workflow.qc.plot_peak_density import plot_density_combined_full def exp_plot_compare_peak_density( tables_marker_distances_calib: Annotated[list[Path], Argument()], + table_marker_distance_uncalib: Annotated[Path, Option()], output_pdf: Annotated[Path, Option()], ) -> None: vegafusion.enable() @@ -18,8 +19,9 @@ def exp_plot_compare_peak_density( table = pl.concat( [pl.read_parquet(path).with_columns(variant=pl.lit(path.parents[1].name)) for path in tables_marker_distances_calib]) + table = pl.concat([table, pl.read_parquet(table_marker_distance_uncalib).with_columns(variant=pl.lit("uncalib"))]) - plot_density_combined(df_peak_dist=table, out_pdf=output_pdf) + plot_density_combined_full(df_peak_dist=table, out_pdf=output_pdf) if __name__ == "__main__": diff --git a/src/depiction_targeted_preproc/workflow/experimental.smk b/src/depiction_targeted_preproc/workflow/experimental.smk index ed90050..1980553 100644 --- a/src/depiction_targeted_preproc/workflow/experimental.smk +++ b/src/depiction_targeted_preproc/workflow/experimental.smk @@ -34,10 +34,12 @@ rule exp_mass_list_preparation: rule exp_plot_compare_peak_density: input: - tables_marker_distance=expand("{{sample}}/{exp_variant}/qc/table_marker_distances_calib.parquet", exp_variant=exp_variants) + tables_marker_distance=expand("{{sample}}/{exp_variant}/qc/table_marker_distances_calib.parquet", exp_variant=exp_variants), + table_marker_distance_uncalib="{sample}/reg_shift/qc/table_marker_distances_baseline.parquet", output: pdf="{sample}/exp_plot_compare_peak_density.pdf" shell: "python -m depiction_targeted_preproc.workflow.exp.plot_compare_peak_density" " {input.tables_marker_distance}" + " --table-marker-distance-uncalib {input.table_marker_distance_uncalib}" " --output-pdf {output.pdf}" \ No newline at end of file diff --git a/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py b/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py index de5f6b9..82b27fb 100644 --- a/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py +++ b/src/depiction_targeted_preproc/workflow/qc/plot_peak_density.py @@ -5,6 +5,7 @@ import polars as pl import typer import vegafusion +from KDEpy import FFTKDE from typer import Option from depiction_targeted_preproc.workflow.qc.plot_calibration_map import get_mass_groups @@ -15,27 +16,57 @@ def subsample_dataframe(df: pl.DataFrame) -> pl.DataFrame: return df.sample(n_samples, seed=1, shuffle=True) -def plot_density_combined(df_peak_dist: pl.DataFrame, out_pdf: Path) -> None: - n_tot = len(df_peak_dist) - df_peak_dist = subsample_dataframe(df_peak_dist) +# def plot_density_combined(df_peak_dist: pl.DataFrame, out_pdf: Path) -> None: +# n_tot = len(df_peak_dist) +# df_peak_dist = subsample_dataframe(df_peak_dist) +# chart = ( +# ( +# alt.Chart(df_peak_dist) +# .mark_line() +# .transform_density( +# density="dist", as_=["dist", "density"], groupby=["variant"], maxsteps=250, bandwidth=0.01 +# ) +# .encode(x="dist:Q", color="variant:N") +# .properties(width=500, height=300) +# ) +# .encode(y=alt.Y("density:Q")) +# .properties(title="Linear scale") +# ) +# chart = chart.properties( +# title=f"Density of target-surrounding peak distances (n_tot = {n_tot} sampled to n = {len(df_peak_dist)})") +# chart.save(out_pdf) + + +def plot_density_combined_full(df_peak_dist: pl.DataFrame, out_pdf: Path) -> None: + variants = df_peak_dist["variant"].unique().to_list() + collect = [] + for variant in variants: + df_variant = df_peak_dist.filter(variant=variant) + dist, density = FFTKDE(bw="ISJ").fit(df_variant["dist"].to_numpy()).evaluate(2 ** 10) + collect.append(pl.DataFrame({"dist": list(dist), "density": list(density), "variant": variant})) + + dist_min = df_peak_dist["dist"].min() + dist_max = df_peak_dist["dist"].max() + df_density = pl.concat(collect).filter((pl.col("dist") >= dist_min) & (pl.col("dist") <= dist_max)) + chart = ( ( - alt.Chart(df_peak_dist) + alt.Chart(df_density) .mark_line() - .transform_density( - density="dist", as_=["dist", "density"], groupby=["variant"], maxsteps=250, bandwidth=0.01 - ) - .encode(x="dist:Q", color="variant:N") + .encode(x="dist:Q", y="density:Q", color="variant:N") .properties(width=500, height=300) ) - .encode(y=alt.Y("density:Q")) .properties(title="Linear scale") ) - chart = chart.properties(title=f"Density of target-surrounding peak distances (n_tot = {n_tot} sampled to n = {len(df_peak_dist)})") + chart = chart.properties( + title=f"Density of target-surrounding peak distances (N={len(df_peak_dist):,})" + ) chart.save(out_pdf) def plot_density_groups(df_peak_dist: pl.DataFrame, mass_groups: pl.DataFrame, out_peak_density_ranges: Path) -> None: + # TODO use kdepy as above + # TODO merge these two functions df_peak_dist_grouped = df_peak_dist.sort("mz_target").join_asof( mass_groups, left_on="mz_target", right_on="mz_min", strategy="backward" @@ -81,7 +112,7 @@ def qc_plot_peak_density( out_peak_density_ranges=output_pdf, ) else: - plot_density_combined(df_peak_dist=table, out_pdf=output_pdf) + plot_density_combined_full(df_peak_dist=table, out_pdf=output_pdf) if __name__ == "__main__":