diff --git a/src/depiction_targeted_preproc/workflow/prepare_pipeline/standardize_table.py b/src/depiction_targeted_preproc/workflow/prepare_pipeline/standardize_table.py deleted file mode 100644 index b771732..0000000 --- a/src/depiction_targeted_preproc/workflow/prepare_pipeline/standardize_table.py +++ /dev/null @@ -1,31 +0,0 @@ -from pathlib import Path - -import polars as pl - - -def standardize_table(input_df: pl.DataFrame) -> pl.DataFrame: - # TODO this is a total hack for a quick setup - mapping = {} - for column in input_df.columns: - if column.lower() in ["marker", "label"]: - mapping[column] = "label" - elif column.lower() in ["mass", "m/z", "pc-mt (m+h)+"]: - mapping[column] = "mass" - elif column.lower() in ["tol"]: - mapping[column] = "tol" - output_df = input_df.rename(mapping) - - if "tol" not in output_df: - # TODO make configurable - output_df = output_df.with_columns([pl.Series("tol", [0.2] * len(output_df))]) - return output_df - - -def copy_standardized_table(input_csv: Path, output_csv: Path): - input_df = pl.read_csv(input_csv) - write_standardized_table(input_df, output_csv) - - -def write_standardized_table(input_df: pl.DataFrame, output_csv: Path) -> None: - output_df = standardize_table(input_df) - output_df.write_csv(output_csv) diff --git a/src/depiction_targeted_preproc/workflow/proc/mass_list_preparation.py b/src/depiction_targeted_preproc/workflow/proc/mass_list_preparation.py deleted file mode 100644 index 9dce722..0000000 --- a/src/depiction_targeted_preproc/workflow/proc/mass_list_preparation.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import annotations - -from pathlib import Path - -import cyclopts -import polars as pl - -app = cyclopts.App() - -COLUMN_NAMES = { - "mass": {"m/z", "mass", "pc-mt (m+h)+"}, - "label": {"marker", "label"}, - "tol": {"tol", "tolerance"}, -} - - -def identify_column_correspondence(raw_df: pl.DataFrame) -> dict[str, str]: - identified_columns = {} - for column_name in raw_df.columns: - for key, values in COLUMN_NAMES.items(): - if column_name.lower() in values: - if key not in identified_columns: - identified_columns[key] = column_name - else: - raise ValueError( - f"Column {column_name} is ambiguous, it could be {key} or {identified_columns[key]}" - ) - required_columns = {"mass", "label"} - missing_columns = required_columns - set(identified_columns.keys()) - if missing_columns: - raise ValueError(f"Missing columns: {missing_columns}") - # reverse the mapping - return {original: target for target, original in identified_columns.items()} - - -@app.default -def mass_list_preparation( - raw_csv: Path, - out_csv: Path, -) -> None: - raw_df = pl.read_csv(raw_csv) - - # identify columns - column_correspondence = identify_column_correspondence(raw_df) - - # rename columns (and drop the rest) - renamed = ( - raw_df.select(column_correspondence.values()) - .rename(column_correspondence) - .select(sorted(column_correspondence.values())) - ) - - # add tol column if not present - if "tol" not in renamed.columns: - renamed = renamed.with_column("tol", pl.Null) - - # write the results - renamed.write_csv(out_csv) - - -if __name__ == "__main__": - app()