Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
wolski committed Jun 5, 2024
2 parents e3d2215 + dc6103d commit abcbd90
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 33 deletions.
13 changes: 9 additions & 4 deletions src/depiction/calibration/spectrum/calibration_method_mcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@
from depiction.calibration.calibration_method import CalibrationMethod
from depiction.calibration.spectrum.calibration_smoothing import smooth_image_features

# rename to Mass Cluster Center Model MCCM
class CalibrationMethodMCC(CalibrationMethod):

class CalibrationMethodMassClusterCenterModel(CalibrationMethod):
"""Implements the Mass Cluster Center Model (MCCM) calibration method as described in the paper by
Wolski, W.E., Farrow, M., Emde, AK. et al. Analytical model of peptide mass cluster centres with applications.
Proteome Sci 4, 18 (2006). https://doi.org/10.1186/1477-5956-4-18
"""

def __init__(
self,
model_smoothing_activated: bool = True,
model_smoothing_activated: bool,
model_smoothing_kernel_size: int = 27,
model_smoothing_kernel_std: float = 10.0,
max_pairwise_distance: float = 500,
Expand All @@ -39,7 +44,7 @@ def extract_spectrum_features(self, peak_mz_arr: NDArray[float], peak_int_arr: N
slope = results.params[0]
peak_mz_corrected = peak_mz_arr * (1 - slope)
delta_intercept = self.compute_distance_from_MCC(peak_mz_corrected, l_none)
intercept_coef = scipy.stats.trim_mean(delta_intercept, 0.3)
intercept_coef = scipy.stats.trim_mean(delta_intercept, 0.25)
return DataArray([intercept_coef, slope], dims=["c"])

def compute_distance_from_MCC(self, delta: NDArray[float], l_none = 1.000482) -> NDArray[float]:
Expand Down
2 changes: 1 addition & 1 deletion src/depiction/parallel_ops/parallel_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class ParallelConfig:
"""

n_jobs: int
task_size: int | None
task_size: int | None = None
verbose: int = 1

def get_splits_count(self, n_items: int) -> int:
Expand Down
23 changes: 15 additions & 8 deletions src/depiction_targeted_preproc/app/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
export_results,
RESULT_FILE_MAPPING,
)
from loguru import logger

from depiction.misc.find_file_util import find_one_by_extension
from depiction_targeted_preproc.pipeline_config.model import (
Expand Down Expand Up @@ -134,23 +135,29 @@ def parse_parameters(yaml_file: Path) -> PipelineParameters:
preset = PipelineParametersPreset.validate(yaml.safe_load(preset_path.read_text()))

# Add n_jobs and requested_artifacts information to build a PipelineParameters
# TODO passing this as strings is technically not correct, but it's the only way that currently works
# when writing the yaml
requested_artifacts = []
if data["application"]["parameters"]["output_activate_calibrated_imzml"]:
if parse_app_boolean_parameter(data, "output_activate_calibrated_imzml"):
requested_artifacts.append(PipelineArtifact.CALIB_IMZML)
# requested_artifacts.append("CALIB_IMZML")
if data["application"]["parameters"]["output_activate_calibrated_ometiff"]:
if parse_app_boolean_parameter(data, "output_activate_calibrated_ometiff"):
requested_artifacts.append(PipelineArtifact.CALIB_IMAGES)
# requested_artifacts.append("CALIB_IMAGES")
if data["application"]["parameters"]["output_activate_calibration_qc"]:
if parse_app_boolean_parameter(data, "output_activate_calibration_qc"):
requested_artifacts.append(PipelineArtifact.CALIB_QC)
# requested_artifacts.append("CALIB_QC")
return PipelineParameters.from_preset_and_settings(
preset=preset, requested_artifacts=requested_artifacts, n_jobs=32
)


def parse_app_boolean_parameter(data: dict, key: str) -> bool:
str_value = data["application"]["parameters"][key]
if str_value == "true":
return True
elif str_value == "false":
return False
else:
logger.warning(f"Unknown boolean value: {str_value}")
return False


def main() -> None:
"""Provides the CLI around `entrypoint`."""
typer.run(entrypoint)
Expand Down
4 changes: 2 additions & 2 deletions src/depiction_targeted_preproc/pipeline_config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class CalibrationRegressShift(BaseModel):
reg_model_type: str
# TODO make explicit
reg_model_unit: str
input_smoothing_activated: bool = True
input_smoothing_activated: bool
input_smoothing_kernel_size: int = 27
input_smoothing_kernel_std: float = 10.0
min_points: int = 3
Expand All @@ -58,7 +58,7 @@ class CalibrationChemicalPeptideNoise(BaseModel):
class CalibrationMCC(BaseModel):
calibration_method: Literal["MCC"]

model_smoothing_activated: bool = True
model_smoothing_activated: bool
model_smoothing_kernel_size: int = 27
model_smoothing_kernel_std: float = 10.0

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from pathlib import Path
from typing import Annotated
import typer
from loguru import logger

from depiction.spectrum.baseline.tophat_baseline import TophatBaseline
from depiction.parallel_ops import ParallelConfig
from depiction.persistence import ImzmlReadFile, ImzmlWriteFile
Expand All @@ -18,7 +20,7 @@ def proc_adjust_baseline(
config = PipelineParameters.parse_yaml(config_path)
match config.baseline_adjustment:
case None:
print("Baseline adjustment is deactivated")
logger.info("Baseline adjustment is deactivated")
shutil.copy(input_imzml_path, output_imzml_path)
shutil.copy(input_imzml_path.with_suffix(".ibd"), output_imzml_path.with_suffix(".ibd"))
case BaselineAdjustmentTophat(window_size=window_size, window_unit=window_unit):
Expand Down
7 changes: 4 additions & 3 deletions src/depiction_targeted_preproc/workflow/proc/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@

import polars as pl
import typer
from loguru import logger

from depiction.calibration.perform_calibration import PerformCalibration
from depiction.calibration.spectrum.calibration_method_chemical_peptide_noise import (
CalibrationMethodChemicalPeptideNoise,
)
from depiction.calibration.spectrum.calibration_method_global_constant_shift import CalibrationMethodGlobalConstantShift
from depiction.calibration.spectrum.calibration_method_mcc import CalibrationMethodMCC
from depiction.calibration.spectrum.calibration_method_mcc import CalibrationMethodMassClusterCenterModel
from depiction.calibration.spectrum.calibration_method_regress_shift import CalibrationMethodRegressShift
from depiction.parallel_ops import ParallelConfig
from depiction.persistence import ImzmlReadFile, ImzmlWriteFile, ImzmlModeEnum
Expand Down Expand Up @@ -44,7 +45,7 @@ def get_calibration_from_config(mass_list: pl.DataFrame, config: PipelineParamet
use_ppm_space=calib_config.use_ppm_space,
)
case model.CalibrationMCC() as calib_config:
return CalibrationMethodMCC(
return CalibrationMethodMassClusterCenterModel(
model_smoothing_activated=calib_config.model_smoothing_activated,
model_smoothing_kernel_size=calib_config.model_smoothing_kernel_size,
model_smoothing_kernel_std=calib_config.model_smoothing_kernel_std,
Expand All @@ -65,7 +66,7 @@ def proc_calibrate(
parallel_config = ParallelConfig(n_jobs=config.n_jobs, task_size=None)
match config.calibration:
case None:
print("No calibration requested")
logger.info("No calibration requested")
shutil.copy(input_imzml_path, output_imzml_path)
shutil.copy(input_imzml_path.with_suffix(".ibd"), output_imzml_path.with_suffix(".ibd"))
case (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@ def calibrate_make_fair_comparison(
parallel_config = ParallelConfig(n_jobs=config.n_jobs, task_size=None)

# Determine if we should simply copy the file or if we should perform some GlobalConstantShift calibration
if isinstance(config.calibration, model.CalibrationRegressShift) or config is None:
if isinstance(config.calibration, model.CalibrationRegressShift) or config.calibration is None:
# no additional calibration requested, simply copy the file
logger.info("Copying input file to output")
shutil.copy(input_imzml_path, output_imzml_path)
shutil.copy(input_imzml_path.with_suffix(".ibd"), output_imzml_path.with_suffix(".ibd"))
else:
logger.info("Performing GlobalConstantShift calibration")
calibration = CalibrationMethodGlobalConstantShift(ref_mz_arr=mass_list["mass"].to_numpy())
Expand Down
4 changes: 3 additions & 1 deletion src/depiction_targeted_preproc/workflow/proc/pick_peaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from typing import Annotated

import typer
from loguru import logger

from depiction.parallel_ops import ParallelConfig
from depiction.spectrum.peak_filtering import FilterNHighestIntensityPartitioned
from depiction.spectrum.peak_picking import BasicInterpolatedPeakPicker
Expand All @@ -21,7 +23,7 @@ def proc_pick_peaks(
config = PipelineParameters.parse_yaml(config_path)
match config.peak_picker:
case None:
print("Peak picking is deactivated")
logger.info("Peak picking is deactivated")
shutil.copy(input_imzml_path, output_imzml_path)
shutil.copy(input_imzml_path.with_suffix(".ibd"), output_imzml_path.with_suffix(".ibd"))
case model.PeakPickerBasicInterpolated() as peak_picker_config:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from collections import defaultdict
from pathlib import Path
from typing import Sequence, Annotated

Expand All @@ -9,30 +8,32 @@

from depiction.persistence import ImzmlReadFile


# TODO maybe rename to marker_surroundings


def get_peak_dist(
read_file: ImzmlReadFile, mz_targets: Sequence[float], mz_labels: Sequence[str], mz_max_dist: float
) -> pl.DataFrame:
collect = defaultdict(list)
collect = []
with read_file.get_reader() as reader:
for i_spectrum in tqdm(range(reader.n_spectra)):
spectrum_mz = reader.get_spectrum_mz(i_spectrum)
for mz_target, label in zip(mz_targets, mz_labels):
peak_mz = reader.get_spectrum_mz(i_spectrum)
dist = peak_mz - mz_target
dist = spectrum_mz - mz_target
# keep only the distances within max dist
sel = np.abs(dist) < mz_max_dist
dist = dist[sel]
peak_mz = peak_mz[sel]
marker_mz = spectrum_mz[sel]
if len(dist):
# add to result set
collect["i_spectrum"].append(i_spectrum)
collect["dist"].append(dist)
collect["mz_peak"].append(peak_mz)
collect["mz_target"].append(mz_target)
collect["label"].append(label)
return pl.DataFrame(collect).explode(["dist", "mz_peak"])
collect.append({
"i_spectrum": i_spectrum,
"dist": list(dist),
"mz_peak": list(marker_mz),
"mz_target": mz_target,
"label": label,
})
return pl.from_dicts(collect).explode(["dist", "mz_peak"])


def qc_table_marker_distances(
Expand Down

0 comments on commit abcbd90

Please sign in to comment.