From c586cf8128289509a7c935a616754d1cc112b3be Mon Sep 17 00:00:00 2001
From: Leonardo Schwarz
Date: Tue, 12 Nov 2024 12:40:05 +0100
Subject: [PATCH] add new code to standardize input panel

---
 .../panel/__init__.py                         |  0
 .../panel/standardize_input_panel.py          | 74 +++++++++++++++++++
 .../test_standardize_input_panel.py           | 35 +++++++++
 3 files changed, 109 insertions(+)
 create mode 100644 src/depiction_targeted_preproc/panel/__init__.py
 create mode 100644 src/depiction_targeted_preproc/panel/standardize_input_panel.py
 create mode 100644 tests/unit/to_be_migrated/test_standardize_input_panel.py

diff --git a/src/depiction_targeted_preproc/panel/__init__.py b/src/depiction_targeted_preproc/panel/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/depiction_targeted_preproc/panel/standardize_input_panel.py b/src/depiction_targeted_preproc/panel/standardize_input_panel.py
new file mode 100644
index 0000000..0e07409
--- /dev/null
+++ b/src/depiction_targeted_preproc/panel/standardize_input_panel.py
@@ -0,0 +1,74 @@
+import polars as pl
+from pydantic import BaseModel
+
+
+class StandardizeConfig(BaseModel):
+    """Configuration describing how the columns of a raw panel table map to standardized columns."""
+
+    # Standardized column name -> accepted names of that column in the raw table (matched case-insensitively).
+    column_names: dict[str, set[str]] = {
+        "mass": {"m/z", "mass", "pc-mt (m+h)+"},
+        "label": {"marker", "label"},
+        "type": {"type"},
+    }
+    # Standardized columns contained in the result, in this order.
+    select_columns: list[str] = ["mass", "label", "type"]
+    # Constant used to fill a standardized column that has no counterpart in the raw table.
+    default_values: dict[str, str] = {"type": "target"}
+
+
+def _identify_column_correspondence(config: StandardizeConfig, raw_df: pl.DataFrame) -> dict[str, str]:
+    """Determine which raw column provides each standardized column.
+
+    Raw column names are compared case-insensitively against the names in ``config.column_names``.
+
+    :param config: the standardization configuration
+    :param raw_df: the raw table whose column names are inspected
+    :return: mapping of raw column name to standardized column name
+    :raises ValueError: if a raw column matches several standardized columns, if several raw columns match the
+        same standardized column, or if a selected column without default value has no raw counterpart
+    """
+    # Lower-case the configured names once so that mixed-case entries in a custom config still match.
+    aliases_by_key = {key: {alias.lower() for alias in aliases} for key, aliases in config.column_names.items()}
+    identified_columns: dict[str, str] = {}
+    for column_name in raw_df.columns:
+        normalized_name = column_name.lower()
+        matching_keys = [key for key, aliases in aliases_by_key.items() if normalized_name in aliases]
+        if not matching_keys:
+            continue
+        if len(matching_keys) > 1:
+            # A raw column can only be renamed to one target, overlapping names cannot be resolved.
+            raise ValueError(f"Column {column_name} is ambiguous, it could be any of {sorted(matching_keys)}")
+        (key,) = matching_keys
+        if key in identified_columns:
+            raise ValueError(f"Column {key} is ambiguous, it could be {identified_columns[key]} or {column_name}")
+        identified_columns[key] = column_name
+    required_columns = set(config.select_columns) - set(config.default_values)
+    missing_columns = required_columns - set(identified_columns)
+    if missing_columns:
+        # sorted() keeps the error message deterministic
+        raise ValueError(f"Missing columns: {sorted(missing_columns)}")
+    # reverse the mapping
+    return {original: target for target, original in identified_columns.items()}
+
+
+def standardize(config: StandardizeConfig, raw_df: pl.DataFrame) -> pl.DataFrame:
+    """Return ``raw_df`` reduced to the standardized columns listed in ``config.select_columns``.
+
+    Recognized columns are renamed to their standardized name, and default-valued columns which are absent
+    from ``raw_df`` are filled with their value from ``config.default_values``.
+
+    :param config: the standardization configuration
+    :param raw_df: the raw table
+    :return: a table containing exactly ``config.select_columns``, in that order
+    :raises ValueError: if the columns of ``raw_df`` cannot be mapped unambiguously or a required one is missing
+    """
+    column_correspondence = _identify_column_correspondence(config=config, raw_df=raw_df)
+    renamed_df = raw_df.select(list(column_correspondence)).rename(column_correspondence)
+    # Only add a default for columns which the raw table did not provide.
+    default_columns = {
+        column: pl.lit(value)
+        for column, value in config.default_values.items()
+        if column not in renamed_df.columns
+    }
+    return renamed_df.with_columns(**default_columns).select(config.select_columns)
diff --git a/tests/unit/to_be_migrated/test_standardize_input_panel.py b/tests/unit/to_be_migrated/test_standardize_input_panel.py
new file mode 100644
index 0000000..f8ba904
--- /dev/null
+++ b/tests/unit/to_be_migrated/test_standardize_input_panel.py
@@ -0,0 +1,35 @@
+import polars as pl
+import polars.testing
+import pytest
+
+from depiction_targeted_preproc.panel.standardize_input_panel import StandardizeConfig, standardize
+
+
+@pytest.fixture
+def config() -> StandardizeConfig:
+    return StandardizeConfig(
+        column_names={"mass": {"m/z"}, "label": {"label", "x"}},
+        select_columns=["mass", "label", "type"],
+        default_values={"type": "something"},
+    )
+
+
+def test_standardize(config: StandardizeConfig) -> None:
+    raw_df = pl.DataFrame({"m/z": [1, 2, 3], "x": ["a", "b", "c"]})
+    result = standardize(config=config, raw_df=raw_df)
+    expected_df = pl.DataFrame(
+        {"mass": [1, 2, 3], "label": ["a", "b", "c"], "type": ["something", "something", "something"]}
+    )
+    pl.testing.assert_frame_equal(result, expected_df)
+
+
+def test_standardize_when_column_missing(config: StandardizeConfig) -> None:
+    raw_df = pl.DataFrame({"m/z": [1, 2, 3]})
+    with pytest.raises(ValueError, match="Missing columns"):
+        standardize(config=config, raw_df=raw_df)
+
+
+def test_standardize_when_column_ambiguous(config: StandardizeConfig) -> None:
+    raw_df = pl.DataFrame({"m/z": [1, 2, 3], "label": ["a", "b", "c"], "x": ["d", "e", "f"]})
+    with pytest.raises(ValueError, match="ambiguous"):
+        standardize(config=config, raw_df=raw_df)