Skip to content

Commit

Permalink
add new code to standardize input panel
Browse files Browse the repository at this point in the history
  • Loading branch information
leoschwarz committed Nov 12, 2024
1 parent 5a98fd4 commit c586cf8
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
Empty file.
44 changes: 44 additions & 0 deletions src/depiction_targeted_preproc/panel/standardize_input_panel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import polars as pl
from pydantic import BaseModel


class StandardizeConfig(BaseModel):
    """Configuration describing how a raw input panel table is standardized."""

    # Maps each standardized column name to the set of raw column names (aliases) that are
    # accepted for it. The built-in aliases are all written in lower case.
    column_names: dict[str, set[str]] = {
        "mass": {"m/z", "mass", "pc-mt (m+h)+"},
        "label": {"marker", "label"},
        "type": {"type"},
    }
    # Standardized columns returned by `standardize`, in this order.
    select_columns: list[str] = ["mass", "label", "type"]
    # Constant value used to fill a standardized column that is not present in the input;
    # columns listed here are not required to exist in the raw table.
    default_values: dict[str, str] = {"type": "target"}


def _identify_column_correspondence(config: StandardizeConfig, raw_df: pl.DataFrame) -> dict[str, str]:
    """Map raw column names of ``raw_df`` to the standardized names in ``config``.

    A raw column corresponds to a standardized column when its lower-cased name equals one of
    the (lower-cased) aliases configured for it in ``config.column_names``. Raw columns that
    match no alias are ignored.

    Args:
        config: Alias sets, selected columns and default values.
        raw_df: Table whose column names are identified.

    Returns:
        Mapping ``{raw column name: standardized column name}``, suitable for ``DataFrame.rename``.

    Raises:
        ValueError: If a raw column matches several standardized columns, if several raw columns
            match the same standardized column, or if a selected column without a default value
            has no matching raw column.
    """
    # The raw name is lower-cased before the lookup, so the aliases have to be lower-cased as
    # well; otherwise an alias containing upper-case characters could never match.
    aliases_by_target = {
        target: {alias.lower() for alias in aliases} for target, aliases in config.column_names.items()
    }

    identified_columns: dict[str, str] = {}
    for column_name in raw_df.columns:
        normalized_name = column_name.lower()
        matching_targets = [target for target, aliases in aliases_by_target.items() if normalized_name in aliases]
        if len(matching_targets) > 1:
            # Reversing the mapping below would silently keep only one of these targets.
            raise ValueError(
                f"Column {column_name!r} is ambiguous, it could be any of {sorted(matching_targets)}"
            )
        for target in matching_targets:
            if target in identified_columns:
                raise ValueError(
                    f"Columns {identified_columns[target]!r} and {column_name!r} are ambiguous, "
                    f"both correspond to {target!r}"
                )
            identified_columns[target] = column_name

    # Columns with a default value may be absent from the raw table.
    required_columns = set(config.select_columns) - set(config.default_values)
    missing_columns = required_columns - set(identified_columns)
    if missing_columns:
        raise ValueError(f"Missing columns: {sorted(missing_columns)}")

    # reverse the mapping: raw name -> standardized name
    return {original: target for target, original in identified_columns.items()}


def standardize(config: StandardizeConfig, raw_df: pl.DataFrame):
    """Return ``raw_df`` reduced to the standardized columns listed in ``config.select_columns``.

    Recognized raw columns are renamed to their standardized names; every column that has an
    entry in ``config.default_values`` and is absent after renaming is added as a constant column.
    """
    source_to_target = _identify_column_correspondence(config=config, raw_df=raw_df)

    # Keep only the recognized columns and give them their standardized names.
    standardized = raw_df.select(source_to_target.keys()).rename(source_to_target)

    # Build literal expressions for the defaulted columns the input did not provide.
    fallback_exprs = {}
    for name in config.default_values:
        if name in standardized.columns:
            continue
        fallback_exprs[name] = pl.lit(config.default_values[name])

    return standardized.with_columns(**fallback_exprs).select(config.select_columns)
23 changes: 23 additions & 0 deletions tests/unit/to_be_migrated/test_standardize_input_panel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import polars as pl
import polars.testing
import pytest

from depiction_targeted_preproc.panel.standardize_input_panel import StandardizeConfig, standardize


@pytest.fixture
def config() -> StandardizeConfig:
    """Provide a config with custom aliases and a custom default value for ``type``."""
    aliases = {"mass": {"m/z"}, "label": {"label", "x"}}
    output_columns = ["mass", "label", "type"]
    fallbacks = {"type": "something"}
    return StandardizeConfig(column_names=aliases, select_columns=output_columns, default_values=fallbacks)


def test_standardize(config: StandardizeConfig) -> None:
    """Raw columns are renamed via their aliases and the absent ``type`` column gets the default."""
    masses = [1, 2, 3]
    labels = ["a", "b", "c"]
    panel = pl.DataFrame({"m/z": masses, "x": labels})
    expected = pl.DataFrame({"mass": masses, "label": labels, "type": ["something"] * 3})

    actual = standardize(config=config, raw_df=panel)

    pl.testing.assert_frame_equal(actual, expected)

0 comments on commit c586cf8

Please sign in to comment.