Merge pull request #195 from HelikarLab/feature/add-boundary-column

Add `boundary` column
HelikarLab · Dec 9, 2024 · 4c1f20c · 4c1f20c
2 parents 2889016 + f56263e
commit 4c1f20c
Show file tree

Hide file tree

Showing 18 changed files with 2,088 additions and 758 deletions.
diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
@@ -14,7 +14,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v3
+        uses: astral-sh/setup-uv@v4
 
       - name: Create Virtual Environment
         run: uv venv
@@ -26,17 +26,17 @@ jobs:
         run: uv run jupyter nbconvert --clear-output --inplace "main/COMO.ipynb"
 
       - name: Format Python Imports
-        uses: astral-sh/ruff-action@v1
+        uses: astral-sh/ruff-action@v2
         with:
           args: "check --fix --select I"
 
       - name: Format code
-        uses: astral-sh/ruff-action@v1
+        uses: astral-sh/ruff-action@v2
         with:
           args: "format"
 
       - name: Format Notebook
-        uses: astral-sh/ruff-action@v1
+        uses: astral-sh/ruff-action@v2
         with:
           args: "format main/COMO.ipynb"
 
@@ -54,7 +54,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Check Lint
-        uses: astral-sh/ruff-action@v1
+        uses: astral-sh/ruff-action@v2
         with:
           args: "check --no-fix --verbose"
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,8 +1,6 @@
-# See https://pre-commit.com for more information
-# See https://pre-commit.com/hooks.html for more hooks
 repos:
--   repo: https://github.com/opensource-nepal/commitlint
-    rev: v1.2.0
+  - repo: https://github.com/commitizen-tools/commitizen
+    rev: master
     hooks:
-    -   id: commitlint
-        name: Commit Lint
+      - id: commitizen
+        stages: [ commit-msg ]
diff --git a/main/COMO.ipynb b/main/COMO.ipynb
diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py
@@ -532,7 +532,7 @@ def _build_model(  # noqa: C901
     )
 
 
-def _create_df(path: Path) -> pd.DataFrame:
+def _create_df_from_file(path: Path) -> pd.DataFrame:
     match path.suffix:
         case ".csv":
             df = pd.read_csv(path, header=0, sep=",")
@@ -547,9 +547,10 @@ def _create_df(path: Path) -> pd.DataFrame:
 
 
 def _collect_boundary_reactions(path: Path) -> _BoundaryReactions:
-    df = _create_df(path)
+    df = _create_df_from_file(path)
     for column in df.columns:
         if column not in [
+            "boundary",
             "reaction",
             "abbreviation",
             "compartment",
@@ -642,15 +643,15 @@ def create_context_specific_model(  # noqa: C901
     exclude_rxns: list[str] = []
     if exclude_rxns_filepath:
         exclude_rxns_filepath: Path = Path(exclude_rxns_filepath)
-        df = _create_df(exclude_rxns_filepath)
+        df = _create_df_from_file(exclude_rxns_filepath)
         if "abbreviation" not in df.columns:
             raise ValueError("The exclude reactions file should have a single column with a header named Abbreviation")
         exclude_rxns = df["abbreviation"].tolist()
 
     force_rxns: list[str] = []
     if force_rxns_filepath:
         force_rxns_filepath: Path = Path(force_rxns_filepath)
-        df = _create_df(force_rxns_filepath)
+        df = _create_df_from_file(force_rxns_filepath)
         if "abbreviation" not in df.columns:
             raise ValueError("The force reactions file should have a single column with a header named Abbreviation")
         force_rxns = df["abbreviation"].tolist()

diff --git a/main/como/custom_types.py b/main/como/custom_types.py
diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py
@@ -16,7 +16,7 @@
 
 from como import proteomics_gen, return_placeholder_data
 from como.combine_distributions import _combine_zscores
-from como.custom_types import RNASeqPreparationMethod
+from como.custom_types import RNAPrepMethod
 from como.project import Config
 from como.utils import split_gene_expression_data
 
@@ -93,7 +93,7 @@ def __post_init__(self):
             raise ValueError("Adjust method must be either 'progressive', 'regressive', 'flat', or 'custom'")
 
 
-def _load_rnaseq_tests(filename, context_name, prep_method: RNASeqPreparationMethod) -> tuple[str, pd.DataFrame]:
+def _load_rnaseq_tests(filename, context_name, prep_method: RNAPrepMethod) -> tuple[str, pd.DataFrame]:
     """Load rnaseq results.
 
     Returns a dictionary of test (context, cell, etc.) names and rnaseq expression data
@@ -112,11 +112,11 @@ def load_dummy_dict():
         raise FileNotFoundError(f"Error: Config file not found at {inquiry_full_path}")
 
     match prep_method:
-        case RNASeqPreparationMethod.TOTAL:
+        case RNAPrepMethod.TOTAL:
             filename = f"rnaseq_total_{context_name}.csv"
-        case RNASeqPreparationMethod.MRNA:
+        case RNAPrepMethod.MRNA:
             filename = f"rnaseq_mrna_{context_name}.csv"
-        case RNASeqPreparationMethod.SCRNA:
+        case RNAPrepMethod.SCRNA:
             filename = f"rnaseq_scrna_{context_name}.csv"
         case _:
             raise ValueError(
@@ -344,15 +344,9 @@ async def _merge_xomics(
     config = Config()
     logger.info(f"Merging data for {context_name}")
     # Load data for each source if it exists; if not, load an empty dummy dataset
-    trnaseq = _load_rnaseq_tests(
-        filename=trnaseq_file, context_name=context_name, prep_method=RNASeqPreparationMethod.TOTAL
-    )
-    mrnaseq = _load_rnaseq_tests(
-        filename=mrnaseq_file, context_name=context_name, prep_method=RNASeqPreparationMethod.MRNA
-    )
-    scrnaseq = _load_rnaseq_tests(
-        filename=scrnaseq_file, context_name=context_name, prep_method=RNASeqPreparationMethod.SCRNA
-    )
+    trnaseq = _load_rnaseq_tests(filename=trnaseq_file, context_name=context_name, prep_method=RNAPrepMethod.TOTAL)
+    mrnaseq = _load_rnaseq_tests(filename=mrnaseq_file, context_name=context_name, prep_method=RNAPrepMethod.MRNA)
+    scrnaseq = _load_rnaseq_tests(filename=scrnaseq_file, context_name=context_name, prep_method=RNAPrepMethod.SCRNA)
     proteomics = proteomics_gen.load_proteomics_tests(filename=proteomics_file, context_name=context_name)
 
     expression_list = []

diff --git a/main/como/rnaseq.py b/main/como/rnaseq.py
@@ -27,9 +27,8 @@
 from scipy.signal import find_peaks
 from sklearn.neighbors import KernelDensity
 
-from como.custom_types import RNASeqPreparationMethod
 from como.migrations import gene_info_migrations
-from como.project import Config
+from como.types import RNAPrepMethod
 from como.utils import convert_gene_data
 
 
@@ -525,20 +524,17 @@ def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics:
 
 def cpm_filter(
     *,
-    context_name: str,
     metrics: NamedMetrics,
     filtering_options: _FilteringOptions,
-    prep: RNASeqPreparationMethod,
+    output_csv_filepath: Path,
 ) -> NamedMetrics:
     """Apply Counts Per Million (CPM) filtering to the count matrix for a given sample."""
-    config = Config()
     n_exp = filtering_options.replicate_ratio
     n_top = filtering_options.high_replicate_ratio
     cut_off = filtering_options.cut_off
 
-    sample: str
     metric: _StudyMetrics
-    for sample, metric in metrics.items():
+    for metric in metrics.values():
         counts: pd.DataFrame = metric.count_matrix
         entrez_ids: list[str] = metric.entrez_gene_ids
         library_size: pd.DataFrame = counts.sum(axis=1)
@@ -548,12 +544,11 @@ def cpm_filter(
         #   thus, (0 / 1) * 1_000_000 = 0
         library_size[library_size == 0] = 1
 
-        output_filepath = config.result_dir / context_name / prep.value / f"CPM_Matrix_{prep.value}_{sample}.csv"
-        output_filepath.parent.mkdir(parents=True, exist_ok=True)
+        output_csv_filepath.parent.mkdir(parents=True, exist_ok=True)
         counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000
         counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids))
-        logger.debug(f"Writing CPM matrix to {output_filepath}")
-        counts_per_million.to_csv(output_filepath, index=False)
+        logger.debug(f"Writing CPM matrix to {output_csv_filepath}")
+        counts_per_million.to_csv(output_csv_filepath, index=False)
 
         # TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason.
         #  Most likely due to multiplying by 1_000_000, not exactly sure why
@@ -656,24 +651,31 @@ def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions,
 
 def filter_counts(
     *,
-    context_name: str,
     metrics: NamedMetrics,
     technique: FilteringTechnique,
     filtering_options: _FilteringOptions,
-    prep: RNASeqPreparationMethod,
+    cpm_output_filepath: Path | None = None,
 ) -> NamedMetrics:
     """Filter the count matrix based on the specified technique."""
     match technique:
         case FilteringTechnique.cpm:
+            if cpm_output_filepath is None:
+                raise ValueError("CPM output filepath must be provided")
             return cpm_filter(
-                context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep
+                metrics=metrics,
+                filtering_options=filtering_options,
+                output_csv_filepath=cpm_output_filepath,
             )
+
         case FilteringTechnique.tpm:
             return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options)
+
         case FilteringTechnique.zfpkm:
             return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True)
+
         case FilteringTechnique.umi:
             return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False)
+
         case _:
             raise ValueError(f"Technique must be one of {FilteringTechnique}")
 
@@ -684,7 +686,7 @@ async def save_rnaseq_tests(
     config_filepath: Path,
     gene_info_filepath: Path,
     output_filepath: Path,
-    prep: RNASeqPreparationMethod,
+    prep: RNAPrepMethod,
     taxon_id: Taxon,
     replicate_ratio: float,
     batch_ratio: float,
@@ -702,7 +704,7 @@ async def save_rnaseq_tests(
         high_batch_ratio=high_batch_ratio,
     )
 
-    if prep == RNASeqPreparationMethod.SCRNA:
+    if prep == RNAPrepMethod.SCRNA:
         technique = FilteringTechnique.umi
         logger.warning(
             "Single cell filtration does not normalize and assumes "
@@ -721,11 +723,9 @@ async def save_rnaseq_tests(
     entrez_gene_ids = read_counts_results.entrez_gene_ids
 
     metrics = filter_counts(
-        context_name=context_name,
         metrics=metrics,
         technique=technique,
         filtering_options=filtering_options,
-        prep=prep,
     )
 
     expressed_genes: list[str] = []
@@ -758,6 +758,7 @@ async def save_rnaseq_tests(
 
     boolean_matrix.to_csv(output_filepath, index=False)
     logger.info(
-        f"{context_name} - Found {expressed_count} expressed and {high_confidence_count} confidently expressed genes"
+        f"{context_name} - Found {expressed_count} expressed genes, "
+        f"{high_confidence_count} of which are confidently expressed"
     )
     logger.success(f"Wrote boolean matrix to {output_filepath}")
diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
@@ -9,7 +9,7 @@
 from loguru import logger
 
 from como import Config
-from como.custom_types import RNASeqPreparationMethod
+from como.custom_types import RNAPrepMethod
 from como.rnaseq import FilteringTechnique, save_rnaseq_tests
 
 
@@ -22,11 +22,11 @@ class _Arguments:
     high_batch_ratio: float
     filtering_technique: FilteringTechnique
     minimum_cutoff: int | str
-    library_prep: RNASeqPreparationMethod
+    library_prep: RNAPrepMethod
     taxon: Taxon
 
     def __post_init__(self):
-        self.library_prep = RNASeqPreparationMethod.from_string(str(self.library_prep))
+        self.library_prep = RNAPrepMethod.from_string(str(self.library_prep))
         self.filtering_technique = FilteringTechnique.from_string(str(self.filtering_technique))
 
         if self.minimum_cutoff is None:
@@ -46,7 +46,7 @@ async def _handle_context_batch(
     batch_ratio_high: float,
     technique: FilteringTechnique,
     cut_off: int | float | str,
-    prep: RNASeqPreparationMethod,
+    prep: RNAPrepMethod,
     taxon: Taxon,
 ) -> None:
     """Iterate through each context type and create rnaseq expression file.
@@ -81,9 +81,9 @@ async def _handle_context_batch(
         rnaseq_input_filepath = (
             config.data_dir / "data_matrices" / context_name / f"gene_counts_matrix_{prep.value}_{context_name}"
         )
-        if prep == RNASeqPreparationMethod.SCRNA:
+        if prep == RNAPrepMethod.SCRNA:
             rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".h5ad")
-        elif prep in {RNASeqPreparationMethod.TOTAL, RNASeqPreparationMethod.MRNA}:
+        elif prep in {RNAPrepMethod.TOTAL, RNAPrepMethod.MRNA}:
             rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".csv")
 
         if not rnaseq_input_filepath.exists():
@@ -117,7 +117,7 @@ async def _handle_context_batch(
 async def rnaseq_gen(
     # config_filepath: Path,
     config_filename: str,
-    prep: RNASeqPreparationMethod,
+    prep: RNAPrepMethod,
     taxon_id: int | str | Taxon,
     replicate_ratio: float = 0.5,
     high_replicate_ratio: float = 1.0,