Skip to content

Commit

Permalink
Merge pull request #195 from HelikarLab/feature/add-boundary-column
Browse files Browse the repository at this point in the history
Add `boundary` column
  • Loading branch information
JoshLoecker authored Dec 9, 2024
2 parents 2889016 + f56263e commit 4c1f20c
Show file tree
Hide file tree
Showing 18 changed files with 2,088 additions and 758 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/continuous_integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v3
uses: astral-sh/setup-uv@v4

- name: Create Virtual Environment
run: uv venv
Expand All @@ -26,17 +26,17 @@ jobs:
run: uv run jupyter nbconvert --clear-output --inplace "main/COMO.ipynb"

- name: Format Python Imports
uses: astral-sh/ruff-action@v1
uses: astral-sh/ruff-action@v2
with:
args: "check --fix --select I"

- name: Format code
uses: astral-sh/ruff-action@v1
uses: astral-sh/ruff-action@v2
with:
args: "format"

- name: Format Notebook
uses: astral-sh/ruff-action@v1
uses: astral-sh/ruff-action@v2
with:
args: "format main/COMO.ipynb"

Expand All @@ -54,7 +54,7 @@ jobs:
uses: actions/checkout@v4

- name: Check Lint
uses: astral-sh/ruff-action@v1
uses: astral-sh/ruff-action@v2
with:
args: "check --no-fix --verbose"

Expand Down
10 changes: 4 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/opensource-nepal/commitlint
rev: v1.2.0
- repo: https://github.com/commitizen-tools/commitizen
rev: master
hooks:
- id: commitlint
name: Commit Lint
- id: commitizen
stages: [ commit-msg ]
321 changes: 124 additions & 197 deletions main/COMO.ipynb

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions main/como/create_context_specific_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ def _build_model( # noqa: C901
)


def _create_df(path: Path) -> pd.DataFrame:
def _create_df_from_file(path: Path) -> pd.DataFrame:
match path.suffix:
case ".csv":
df = pd.read_csv(path, header=0, sep=",")
Expand All @@ -547,9 +547,10 @@ def _create_df(path: Path) -> pd.DataFrame:


def _collect_boundary_reactions(path: Path) -> _BoundaryReactions:
df = _create_df(path)
df = _create_df_from_file(path)
for column in df.columns:
if column not in [
"boundary",
"reaction",
"abbreviation",
"compartment",
Expand Down Expand Up @@ -642,15 +643,15 @@ def create_context_specific_model( # noqa: C901
exclude_rxns: list[str] = []
if exclude_rxns_filepath:
exclude_rxns_filepath: Path = Path(exclude_rxns_filepath)
df = _create_df(exclude_rxns_filepath)
df = _create_df_from_file(exclude_rxns_filepath)
if "abbreviation" not in df.columns:
raise ValueError("The exclude reactions file should have a single column with a header named Abbreviation")
exclude_rxns = df["abbreviation"].tolist()

force_rxns: list[str] = []
if force_rxns_filepath:
force_rxns_filepath: Path = Path(force_rxns_filepath)
df = _create_df(force_rxns_filepath)
df = _create_df_from_file(force_rxns_filepath)
if "abbreviation" not in df.columns:
raise ValueError("The force reactions file should have a single column with a header named Abbreviation")
force_rxns = df["abbreviation"].tolist()
Expand Down
23 changes: 0 additions & 23 deletions main/como/custom_types.py

This file was deleted.

22 changes: 8 additions & 14 deletions main/como/merge_xomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from como import proteomics_gen, return_placeholder_data
from como.combine_distributions import _combine_zscores
from como.custom_types import RNASeqPreparationMethod
from como.custom_types import RNAPrepMethod
from como.project import Config
from como.utils import split_gene_expression_data

Expand Down Expand Up @@ -93,7 +93,7 @@ def __post_init__(self):
raise ValueError("Adjust method must be either 'progressive', 'regressive', 'flat', or 'custom'")


def _load_rnaseq_tests(filename, context_name, prep_method: RNASeqPreparationMethod) -> tuple[str, pd.DataFrame]:
def _load_rnaseq_tests(filename, context_name, prep_method: RNAPrepMethod) -> tuple[str, pd.DataFrame]:
"""Load rnaseq results.
Returns a dictionary of test (context, cell, etc.) names and rnaseq expression data
Expand All @@ -112,11 +112,11 @@ def load_dummy_dict():
raise FileNotFoundError(f"Error: Config file not found at {inquiry_full_path}")

match prep_method:
case RNASeqPreparationMethod.TOTAL:
case RNAPrepMethod.TOTAL:
filename = f"rnaseq_total_{context_name}.csv"
case RNASeqPreparationMethod.MRNA:
case RNAPrepMethod.MRNA:
filename = f"rnaseq_mrna_{context_name}.csv"
case RNASeqPreparationMethod.SCRNA:
case RNAPrepMethod.SCRNA:
filename = f"rnaseq_scrna_{context_name}.csv"
case _:
raise ValueError(
Expand Down Expand Up @@ -344,15 +344,9 @@ async def _merge_xomics(
config = Config()
logger.info(f"Merging data for {context_name}")
# Load data for each source if it exists. If not, load an empty dummy dataset
trnaseq = _load_rnaseq_tests(
filename=trnaseq_file, context_name=context_name, prep_method=RNASeqPreparationMethod.TOTAL
)
mrnaseq = _load_rnaseq_tests(
filename=mrnaseq_file, context_name=context_name, prep_method=RNASeqPreparationMethod.MRNA
)
scrnaseq = _load_rnaseq_tests(
filename=scrnaseq_file, context_name=context_name, prep_method=RNASeqPreparationMethod.SCRNA
)
trnaseq = _load_rnaseq_tests(filename=trnaseq_file, context_name=context_name, prep_method=RNAPrepMethod.TOTAL)
mrnaseq = _load_rnaseq_tests(filename=mrnaseq_file, context_name=context_name, prep_method=RNAPrepMethod.MRNA)
scrnaseq = _load_rnaseq_tests(filename=scrnaseq_file, context_name=context_name, prep_method=RNAPrepMethod.SCRNA)
proteomics = proteomics_gen.load_proteomics_tests(filename=proteomics_file, context_name=context_name)

expression_list = []
Expand Down
39 changes: 20 additions & 19 deletions main/como/rnaseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@
from scipy.signal import find_peaks
from sklearn.neighbors import KernelDensity

from como.custom_types import RNASeqPreparationMethod
from como.migrations import gene_info_migrations
from como.project import Config
from como.types import RNAPrepMethod
from como.utils import convert_gene_data


Expand Down Expand Up @@ -525,20 +524,17 @@ def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics:

def cpm_filter(
*,
context_name: str,
metrics: NamedMetrics,
filtering_options: _FilteringOptions,
prep: RNASeqPreparationMethod,
output_csv_filepath: Path,
) -> NamedMetrics:
"""Apply Counts Per Million (CPM) filtering to the count matrix for a given sample."""
config = Config()
n_exp = filtering_options.replicate_ratio
n_top = filtering_options.high_replicate_ratio
cut_off = filtering_options.cut_off

sample: str
metric: _StudyMetrics
for sample, metric in metrics.items():
for metric in metrics.values():
counts: pd.DataFrame = metric.count_matrix
entrez_ids: list[str] = metric.entrez_gene_ids
library_size: pd.DataFrame = counts.sum(axis=1)
Expand All @@ -548,12 +544,11 @@ def cpm_filter(
# thus, (0 / 1) * 1_000_000 = 0
library_size[library_size == 0] = 1

output_filepath = config.result_dir / context_name / prep.value / f"CPM_Matrix_{prep.value}_{sample}.csv"
output_filepath.parent.mkdir(parents=True, exist_ok=True)
output_csv_filepath.parent.mkdir(parents=True, exist_ok=True)
counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000
counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids))
logger.debug(f"Writing CPM matrix to {output_filepath}")
counts_per_million.to_csv(output_filepath, index=False)
logger.debug(f"Writing CPM matrix to {output_csv_filepath}")
counts_per_million.to_csv(output_csv_filepath, index=False)

# TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason.
# Most likely due to multiplying by 1_000_000, not exactly sure why
Expand Down Expand Up @@ -656,24 +651,31 @@ def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions,

def filter_counts(
*,
context_name: str,
metrics: NamedMetrics,
technique: FilteringTechnique,
filtering_options: _FilteringOptions,
prep: RNASeqPreparationMethod,
cpm_output_filepath: Path | None = None,
) -> NamedMetrics:
"""Filter the count matrix based on the specified technique."""
match technique:
case FilteringTechnique.cpm:
if cpm_output_filepath is None:
raise ValueError("CPM output filepath must be provided")
return cpm_filter(
context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep
metrics=metrics,
filtering_options=filtering_options,
output_csv_filepath=cpm_output_filepath,
)

case FilteringTechnique.tpm:
return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options)

case FilteringTechnique.zfpkm:
return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True)

case FilteringTechnique.umi:
return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False)

case _:
raise ValueError(f"Technique must be one of {FilteringTechnique}")

Expand All @@ -684,7 +686,7 @@ async def save_rnaseq_tests(
config_filepath: Path,
gene_info_filepath: Path,
output_filepath: Path,
prep: RNASeqPreparationMethod,
prep: RNAPrepMethod,
taxon_id: Taxon,
replicate_ratio: float,
batch_ratio: float,
Expand All @@ -702,7 +704,7 @@ async def save_rnaseq_tests(
high_batch_ratio=high_batch_ratio,
)

if prep == RNASeqPreparationMethod.SCRNA:
if prep == RNAPrepMethod.SCRNA:
technique = FilteringTechnique.umi
logger.warning(
"Single cell filtration does not normalize and assumes "
Expand All @@ -721,11 +723,9 @@ async def save_rnaseq_tests(
entrez_gene_ids = read_counts_results.entrez_gene_ids

metrics = filter_counts(
context_name=context_name,
metrics=metrics,
technique=technique,
filtering_options=filtering_options,
prep=prep,
)

expressed_genes: list[str] = []
Expand Down Expand Up @@ -758,6 +758,7 @@ async def save_rnaseq_tests(

boolean_matrix.to_csv(output_filepath, index=False)
logger.info(
f"{context_name} - Found {expressed_count} expressed and {high_confidence_count} confidently expressed genes"
f"{context_name} - Found {expressed_count} expressed genes, "
f"{high_confidence_count} of which are confidently expressed"
)
logger.success(f"Wrote boolean matrix to {output_filepath}")
14 changes: 7 additions & 7 deletions main/como/rnaseq_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from loguru import logger

from como import Config
from como.custom_types import RNASeqPreparationMethod
from como.custom_types import RNAPrepMethod
from como.rnaseq import FilteringTechnique, save_rnaseq_tests


Expand All @@ -22,11 +22,11 @@ class _Arguments:
high_batch_ratio: float
filtering_technique: FilteringTechnique
minimum_cutoff: int | str
library_prep: RNASeqPreparationMethod
library_prep: RNAPrepMethod
taxon: Taxon

def __post_init__(self):
self.library_prep = RNASeqPreparationMethod.from_string(str(self.library_prep))
self.library_prep = RNAPrepMethod.from_string(str(self.library_prep))
self.filtering_technique = FilteringTechnique.from_string(str(self.filtering_technique))

if self.minimum_cutoff is None:
Expand All @@ -46,7 +46,7 @@ async def _handle_context_batch(
batch_ratio_high: float,
technique: FilteringTechnique,
cut_off: int | float | str,
prep: RNASeqPreparationMethod,
prep: RNAPrepMethod,
taxon: Taxon,
) -> None:
"""Iterate through each context type and create rnaseq expression file.
Expand Down Expand Up @@ -81,9 +81,9 @@ async def _handle_context_batch(
rnaseq_input_filepath = (
config.data_dir / "data_matrices" / context_name / f"gene_counts_matrix_{prep.value}_{context_name}"
)
if prep == RNASeqPreparationMethod.SCRNA:
if prep == RNAPrepMethod.SCRNA:
rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".h5ad")
elif prep in {RNASeqPreparationMethod.TOTAL, RNASeqPreparationMethod.MRNA}:
elif prep in {RNAPrepMethod.TOTAL, RNAPrepMethod.MRNA}:
rnaseq_input_filepath = rnaseq_input_filepath.with_suffix(".csv")

if not rnaseq_input_filepath.exists():
Expand Down Expand Up @@ -117,7 +117,7 @@ async def _handle_context_batch(
async def rnaseq_gen(
# config_filepath: Path,
config_filename: str,
prep: RNASeqPreparationMethod,
prep: RNAPrepMethod,
taxon_id: int | str | Taxon,
replicate_ratio: float = 0.5,
high_replicate_ratio: float = 1.0,
Expand Down
Loading

0 comments on commit 4c1f20c

Please sign in to comment.