Skip to content

Commit

Permalink
Merge pull request #193 from HelikarLab/remove-hardcoded-paths/rnaseq
Browse files Browse the repository at this point in the history
Remove hardcoded paths from rnaseq.py
  • Loading branch information
JoshLoecker authored Dec 9, 2024
2 parents 0422633 + 82044bd commit 012a1d9
Showing 1 changed file with 18 additions and 17 deletions.
35 changes: 18 additions & 17 deletions main/como/rnaseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@
from scipy.signal import find_peaks
from sklearn.neighbors import KernelDensity

from como.custom_types import RNAPrepMethod
from como.migrations import gene_info_migrations
from como.project import Config
from como.types import RNAPrepMethod
from como.utils import convert_gene_data


Expand Down Expand Up @@ -525,20 +524,17 @@ def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics:

def cpm_filter(
*,
context_name: str,
metrics: NamedMetrics,
filtering_options: _FilteringOptions,
prep: RNAPrepMethod,
output_csv_filepath: Path,
) -> NamedMetrics:
"""Apply Counts Per Million (CPM) filtering to the count matrix for a given sample."""
config = Config()
n_exp = filtering_options.replicate_ratio
n_top = filtering_options.high_replicate_ratio
cut_off = filtering_options.cut_off

sample: str
metric: _StudyMetrics
for sample, metric in metrics.items():
for metric in metrics.values():
counts: pd.DataFrame = metric.count_matrix
entrez_ids: list[str] = metric.entrez_gene_ids
library_size: pd.DataFrame = counts.sum(axis=1)
Expand All @@ -548,12 +544,11 @@ def cpm_filter(
# thus, (0 / 1) * 1_000_000 = 0
library_size[library_size == 0] = 1

output_filepath = config.result_dir / context_name / prep.value / f"CPM_Matrix_{prep.value}_{sample}.csv"
output_filepath.parent.mkdir(parents=True, exist_ok=True)
output_csv_filepath.parent.mkdir(parents=True, exist_ok=True)
counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000
counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids))
logger.debug(f"Writing CPM matrix to {output_filepath}")
counts_per_million.to_csv(output_filepath, index=False)
logger.debug(f"Writing CPM matrix to {output_csv_filepath}")
counts_per_million.to_csv(output_csv_filepath, index=False)

# TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason.
# Most likely due to multiplying by 1_000_000, not exactly sure why
Expand Down Expand Up @@ -656,24 +651,31 @@ def zfpkm_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions,

def filter_counts(
*,
context_name: str,
metrics: NamedMetrics,
technique: FilteringTechnique,
filtering_options: _FilteringOptions,
prep: RNAPrepMethod,
cpm_output_filepath: Path | None = None,
) -> NamedMetrics:
"""Filter the count matrix based on the specified technique."""
match technique:
case FilteringTechnique.cpm:
if cpm_output_filepath is None:
raise ValueError("CPM output filepath must be provided")
return cpm_filter(
context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep
metrics=metrics,
filtering_options=filtering_options,
output_csv_filepath=cpm_output_filepath,
)

case FilteringTechnique.tpm:
return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options)

case FilteringTechnique.zfpkm:
return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=True)

case FilteringTechnique.umi:
return zfpkm_filter(metrics=metrics, filtering_options=filtering_options, calcualte_fpkm=False)

case _:
raise ValueError(f"Technique must be one of {FilteringTechnique}")

Expand Down Expand Up @@ -721,11 +723,9 @@ async def save_rnaseq_tests(
entrez_gene_ids = read_counts_results.entrez_gene_ids

metrics = filter_counts(
context_name=context_name,
metrics=metrics,
technique=technique,
filtering_options=filtering_options,
prep=prep,
)

expressed_genes: list[str] = []
Expand Down Expand Up @@ -758,6 +758,7 @@ async def save_rnaseq_tests(

boolean_matrix.to_csv(output_filepath, index=False)
logger.info(
f"{context_name} - Found {expressed_count} expressed and {high_confidence_count} confidently expressed genes"
f"{context_name} - Found {expressed_count} expressed genes, "
f"{high_confidence_count} of which are confidently expressed"
)
logger.success(f"Wrote boolean matrix to {output_filepath}")

0 comments on commit 012a1d9

Please sign in to comment.