From 9ef3d466c7282355b087d5569c87e08ea5e54e78 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:11:22 -0500 Subject: [PATCH 01/14] Add options to filter CDS by genes and add padding --- gnomad/utils/filtering.py | 36 ++++++++++++++++++++++++++++++----- init_scripts/master-init.sh | 2 +- init_scripts/sparklyr-init.sh | 2 +- init_scripts/vep81-init.sh | 4 +--- init_scripts/vep85-init.sh | 6 +----- 5 files changed, 35 insertions(+), 15 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 75aecaa60..aae35e4a4 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -6,6 +6,7 @@ from typing import Callable, Dict, List, Optional, Tuple, Union import hail as hl +from sympy import intervals import gnomad.utils.annotations as annotate_utils from gnomad.resources.resource_utils import DataException @@ -403,7 +404,10 @@ def filter_to_clinvar_pathogenic( def filter_to_gencode_cds( - t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None + t: Union[hl.MatrixTable, hl.Table], + gencode_ht: Optional[hl.Table] = None, + genes: Optional[Union[str, List[str]]] = None, + padding: Optional[int] = 0, ) -> hl.Table: """ Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts. @@ -436,6 +440,8 @@ def filter_to_gencode_cds( :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable to CDS regions. Default is None, which will use the default version of the Gencode Table resource. + :param genes: Gene(s) to filter to. Default is None, which will filter to all genes. + :param padding: Number of bases to pad the CDS intervals by. Default is 0. :return: Table/MatrixTable filtered to loci in Gencode CDS intervals. """ if gencode_ht is None: @@ -460,12 +466,32 @@ def filter_to_gencode_cds( "This Gencode CDS interval filter does not filter by transcript! Please see the" " documentation for more details to confirm it's being used as intended." ) - filter_expr = hl.is_defined(gencode_ht[t.locus]) + if genes: + genes_upper = [genes.upper()] if isinstance(genes, str) else [g.upper() for g + in genes] + gencode_ht = gencode_ht.filter( + hl.literal(genes_upper).contains(gencode_ht.gene_name)) + + if padding: + gencode_ht = gencode_ht.annotate( + padded_interval=hl.locus_interval( + gencode_ht.interval.start.contig, + gencode_ht.interval.start.position - padding, + gencode_ht.interval.end.position + padding, + includes_start=gencode_ht.interval.includes_start, + includes_end=gencode_ht.interval.includes_end, + reference_genome=gencode_ht.interval.start.dtype.reference_genome, + ) + ) + gencode_ht = gencode_ht.key_by("padded_interval") - if isinstance(t, hl.MatrixTable): - t = t.filter_rows(filter_expr) + if genes or padding: + # Only collect intervals if filtering by genes or padding + intervals_expr = gencode_ht.padded_interval if padding else gencode_ht.interval + cds_intervals = intervals_expr.collect() + t = hl.filter_intervals(t, cds_intervals ) else: - t = t.filter(filter_expr) + t = t.filter_rows(hl.is_defined(gencode_ht[t.locus])) if isinstance(t, hl.MatrixTable) else t.filter(hl.is_defined(gencode_ht[t.locus])) return t diff --git a/init_scripts/master-init.sh b/init_scripts/master-init.sh index d777f5986..2291eb4e3 100644 --- a/init_scripts/master-init.sh +++ b/init_scripts/master-init.sh @@ -23,4 +23,4 @@ if [[ "${ROLE}" == 'Master' ]]; then ./gnomad-init.sh > gnomad_startup.log 2>&1 & ./sparklyr-init.sh > sparklyr_startup.log 2>&1 & -fi \ No newline at end of file +fi diff --git a/init_scripts/sparklyr-init.sh b/init_scripts/sparklyr-init.sh index 1ded18d4e..fae1f7472 100644 --- a/init_scripts/sparklyr-init.sh +++ b/init_scripts/sparklyr-init.sh @@ -24,4 +24,4 @@ ln -s /usr/lib/spark /opt/spark/spark-2.0.2-bin-hadoop2.7 R --vanilla -e "install.packages(c('sparklyr', 'dplyr'), repos='https://cran.rstudio.com')" R --vanilla -e "install.packages(c('magrittr', 'ggplot2', 'slackr', 'ggrepel'), repos='https://cran.rstudio.com')" R --vanilla -e "install.packages(c('plyr', 'shiny', 'plotly'), repos='https://cran.rstudio.com')" -R --vanilla -e "install.packages(c('DT', 'tidyverse', 'broom', 'randomForest', 'ROCR', 'shinythemes', 'devtools'), repos='https://cran.rstudio.com')" \ No newline at end of file +R --vanilla -e "install.packages(c('DT', 'tidyverse', 'broom', 'randomForest', 'ROCR', 'shinythemes', 'devtools'), repos='https://cran.rstudio.com')" diff --git a/init_scripts/vep81-init.sh b/init_scripts/vep81-init.sh index b8dd4f5e3..b05a529c5 100644 --- a/init_scripts/vep81-init.sh +++ b/init_scripts/vep81-init.sh @@ -2,7 +2,7 @@ # Copy VEP -mkdir -p /vep/homo_sapiens +mkdir -p /vep/homo_sapiens gsutil -m cp -r gs://hail-common/vep/vep/loftee /vep gsutil -m cp -r gs://hail-common/vep/vep/ensembl-tools-release-81 /vep gsutil -m cp -r gs://hail-common/vep/vep/loftee_data /vep @@ -36,5 +36,3 @@ gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh /vep/run_hail_vep_vcf.sh /vep/1var.vcf - - diff --git a/init_scripts/vep85-init.sh b/init_scripts/vep85-init.sh index 10d6af717..bfac0a35b 100644 --- a/init_scripts/vep85-init.sh +++ b/init_scripts/vep85-init.sh @@ -2,7 +2,7 @@ # Copy VEP -mkdir -p /vep/homo_sapiens +mkdir -p /vep/homo_sapiens gsutil -m cp -r gs://hail-common/vep/vep/loftee /vep gsutil -m cp -r gs://hail-common/vep/vep/ensembl-tools-release-85 /vep gsutil -m cp -r gs://hail-common/vep/vep/loftee_data /vep @@ -40,7 +40,3 @@ gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh /vep/run_hail_vep_vcf.sh /vep/1var.vcf - - - - From 4897380341121d2eb8cfc4f2388c493d64344ccb Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:11:40 -0500 Subject: [PATCH 02/14] Add options to filter CDS by genes and add padding --- gnomad/utils/filtering.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index aae35e4a4..2ddb3a373 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -467,10 +467,12 @@ def filter_to_gencode_cds( " documentation for more details to confirm it's being used as intended." ) if genes: - genes_upper = [genes.upper()] if isinstance(genes, str) else [g.upper() for g - in genes] + genes_upper = ( + [genes.upper()] if isinstance(genes, str) else [g.upper() for g in genes] + ) gencode_ht = gencode_ht.filter( - hl.literal(genes_upper).contains(gencode_ht.gene_name)) + hl.literal(genes_upper).contains(gencode_ht.gene_name) + ) if padding: gencode_ht = gencode_ht.annotate( @@ -489,9 +491,13 @@ def filter_to_gencode_cds( # Only collect intervals if filtering by genes or padding intervals_expr = gencode_ht.padded_interval if padding else gencode_ht.interval cds_intervals = intervals_expr.collect() - t = hl.filter_intervals(t, cds_intervals ) + t = hl.filter_intervals(t, cds_intervals) else: - t = t.filter_rows(hl.is_defined(gencode_ht[t.locus])) if isinstance(t, hl.MatrixTable) else t.filter(hl.is_defined(gencode_ht[t.locus])) + t = ( + t.filter_rows(hl.is_defined(gencode_ht[t.locus])) + if isinstance(t, hl.MatrixTable) + else t.filter(hl.is_defined(gencode_ht[t.locus])) + ) return t From a20ad6b1c516833a99feed6f224759d4ded84e8a Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:13:08 -0500 Subject: [PATCH 03/14] Add note --- gnomad/utils/filtering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 2ddb3a373..81fe2040d 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -488,7 +488,7 @@ def filter_to_gencode_cds( gencode_ht = gencode_ht.key_by("padded_interval") if genes or padding: - # Only collect intervals if filtering by genes or padding + # Only collect intervals if filtering by genes or padding, to avoid memory issues intervals_expr = gencode_ht.padded_interval if padding else gencode_ht.interval cds_intervals = intervals_expr.collect() t = hl.filter_intervals(t, cds_intervals) From bf6a840c03003196ce7ff52d805713f3baaf5d89 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:30:27 -0500 Subject: [PATCH 04/14] autopep8 formatting --- gnomad/utils/filtering.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 81fe2040d..9142b33d1 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -488,7 +488,8 @@ def filter_to_gencode_cds( gencode_ht = gencode_ht.key_by("padded_interval") if genes or padding: - # Only collect intervals if filtering by genes or padding, to avoid memory issues + # Only collect intervals if filtering by genes or padding, to avoid memory + # issues intervals_expr = gencode_ht.padded_interval if padding else gencode_ht.interval cds_intervals = intervals_expr.collect() t = hl.filter_intervals(t, cds_intervals) From 3f1abaecd373a3d534c559e5b36bdcd3f94c62f3 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:35:16 -0500 Subject: [PATCH 05/14] Remove unused import --- gnomad/utils/filtering.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 9142b33d1..0d185facc 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -6,7 +6,6 @@ from typing import Callable, Dict, List, Optional, Tuple, Union import hail as hl -from sympy import intervals import gnomad.utils.annotations as annotate_utils from gnomad.resources.resource_utils import DataException From 426c576e594df07da1a79523aca280cf571f2339 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 14:56:12 -0500 Subject: [PATCH 06/14] set max intervals to collect --- gnomad/utils/filtering.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 0d185facc..c082a6e50 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -406,7 +406,9 @@ def filter_to_gencode_cds( t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None, genes: Optional[Union[str, List[str]]] = None, - padding: Optional[int] = 0, + by_gene_symbol: bool = True, + padding_bp: Optional[int] = 0, + max_intervals: Optional[int] = 3000, ) -> hl.Table: """ Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts. @@ -439,8 +441,10 @@ def filter_to_gencode_cds( :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable to CDS regions. Default is None, which will use the default version of the Gencode Table resource. - :param genes: Gene(s) to filter to. Default is None, which will filter to all genes. - :param padding: Number of bases to pad the CDS intervals by. Default is 0. + :param genes: Optional gene(s) to filter to. Can be a single gene string or list of genes. Default is None. + :param by_gene_symbol: Whether to filter by gene symbol. Default is True. If False, will filter by gene ID. + :param padding_bp: Number of bases to pad the CDS intervals by. Default is 0. + :param max_intervals: Maximum number of intervals to collect before filtering. Default is 1000. :return: Table/MatrixTable filtered to loci in Gencode CDS intervals. """ if gencode_ht is None: @@ -469,30 +473,35 @@ def filter_to_gencode_cds( genes_upper = ( [genes.upper()] if isinstance(genes, str) else [g.upper() for g in genes] ) - gencode_ht = gencode_ht.filter( - hl.literal(genes_upper).contains(gencode_ht.gene_name) - ) + if by_gene_symbol: + gencode_ht = gencode_ht.filter( + hl.literal(genes_upper).contains(gencode_ht.gene_name) + ) + else: + gencode_ht = gencode_ht.filter( + hl.literal(genes_upper).contains(gencode_ht.gene_id) + ) - if padding: + if padding_bp: gencode_ht = gencode_ht.annotate( padded_interval=hl.locus_interval( gencode_ht.interval.start.contig, - gencode_ht.interval.start.position - padding, - gencode_ht.interval.end.position + padding, + gencode_ht.interval.start.position - padding_bp, + gencode_ht.interval.end.position + padding_bp, includes_start=gencode_ht.interval.includes_start, includes_end=gencode_ht.interval.includes_end, reference_genome=gencode_ht.interval.start.dtype.reference_genome, ) ) - gencode_ht = gencode_ht.key_by("padded_interval") - if genes or padding: - # Only collect intervals if filtering by genes or padding, to avoid memory - # issues - intervals_expr = gencode_ht.padded_interval if padding else gencode_ht.interval - cds_intervals = intervals_expr.collect() + # Collect intervals if there are more than `max_intervals` to avoid memory issues + if gencode_ht.count() <= max_intervals: + logger.info("Since %d is smaller than the max intervals that can be collected, collecting all intervals", gencode_ht.count()) + cds_intervals = gencode_ht.padded_interval if padding_bp else gencode_ht.interval + cds_intervals = cds_intervals.collect() t = hl.filter_intervals(t, cds_intervals) else: + logger.info("Filter to Gencode CDS intervals ") t = ( t.filter_rows(hl.is_defined(gencode_ht[t.locus])) if isinstance(t, hl.MatrixTable) From 1eb088f38e480f95599b03a102dcf6978c63abf6 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:05:40 -0500 Subject: [PATCH 07/14] key_by new interval only needed --- gnomad/utils/filtering.py | 15 +++++++++++---- init_scripts/master-init.sh | 2 +- init_scripts/sparklyr-init.sh | 2 +- init_scripts/vep81-init.sh | 2 +- init_scripts/vep85-init.sh | 2 +- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index c082a6e50..ec9f60dae 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -495,13 +495,20 @@ def filter_to_gencode_cds( ) # Collect intervals if there are more than `max_intervals` to avoid memory issues - if gencode_ht.count() <= max_intervals: - logger.info("Since %d is smaller than the max intervals that can be collected, collecting all intervals", gencode_ht.count()) - cds_intervals = gencode_ht.padded_interval if padding_bp else gencode_ht.interval + if gencode_ht.count() < max_intervals: + logger.info( + "Since %d is smaller than the max intervals that can be collected, collecting all intervals...", + gencode_ht.count(), + ) + cds_intervals = ( + gencode_ht.padded_interval if padding_bp else gencode_ht.interval + ) cds_intervals = cds_intervals.collect() t = hl.filter_intervals(t, cds_intervals) else: - logger.info("Filter to Gencode CDS intervals ") + if padding_bp: + gencode_ht = gencode_ht.key_by("padded_interval") + t = ( t.filter_rows(hl.is_defined(gencode_ht[t.locus])) if isinstance(t, hl.MatrixTable) diff --git a/init_scripts/master-init.sh b/init_scripts/master-init.sh index 2291eb4e3..d777f5986 100644 --- a/init_scripts/master-init.sh +++ b/init_scripts/master-init.sh @@ -23,4 +23,4 @@ if [[ "${ROLE}" == 'Master' ]]; then ./gnomad-init.sh > gnomad_startup.log 2>&1 & ./sparklyr-init.sh > sparklyr_startup.log 2>&1 & -fi +fi \ No newline at end of file diff --git a/init_scripts/sparklyr-init.sh b/init_scripts/sparklyr-init.sh index fae1f7472..1ded18d4e 100644 --- a/init_scripts/sparklyr-init.sh +++ b/init_scripts/sparklyr-init.sh @@ -24,4 +24,4 @@ ln -s /usr/lib/spark /opt/spark/spark-2.0.2-bin-hadoop2.7 R --vanilla -e "install.packages(c('sparklyr', 'dplyr'), repos='https://cran.rstudio.com')" R --vanilla -e "install.packages(c('magrittr', 'ggplot2', 'slackr', 'ggrepel'), repos='https://cran.rstudio.com')" R --vanilla -e "install.packages(c('plyr', 'shiny', 'plotly'), repos='https://cran.rstudio.com')" -R --vanilla -e "install.packages(c('DT', 'tidyverse', 'broom', 'randomForest', 'ROCR', 'shinythemes', 'devtools'), repos='https://cran.rstudio.com')" +R --vanilla -e "install.packages(c('DT', 'tidyverse', 'broom', 'randomForest', 'ROCR', 'shinythemes', 'devtools'), repos='https://cran.rstudio.com')" \ No newline at end of file diff --git a/init_scripts/vep81-init.sh b/init_scripts/vep81-init.sh index b05a529c5..96baac55a 100644 --- a/init_scripts/vep81-init.sh +++ b/init_scripts/vep81-init.sh @@ -35,4 +35,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh -/vep/run_hail_vep_vcf.sh /vep/1var.vcf +/vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file diff --git a/init_scripts/vep85-init.sh b/init_scripts/vep85-init.sh index bfac0a35b..73d93bd31 100644 --- a/init_scripts/vep85-init.sh +++ b/init_scripts/vep85-init.sh @@ -39,4 +39,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh -/vep/run_hail_vep_vcf.sh /vep/1var.vcf +/vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file From e12a52eb3abd736fa97513e6ca846de315220571 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:09:20 -0500 Subject: [PATCH 08/14] undo changes --- init_scripts/vep81-init.sh | 2 +- init_scripts/vep85-init.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/init_scripts/vep81-init.sh b/init_scripts/vep81-init.sh index 96baac55a..586913350 100644 --- a/init_scripts/vep81-init.sh +++ b/init_scripts/vep81-init.sh @@ -35,4 +35,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh -/vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file + /vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file diff --git a/init_scripts/vep85-init.sh b/init_scripts/vep85-init.sh index 73d93bd31..a5da2066c 100644 --- a/init_scripts/vep85-init.sh +++ b/init_scripts/vep85-init.sh @@ -39,4 +39,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh -/vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file + /vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file From 11c11aaec76a4c16e2b5a55813ff41ce97e46581 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:11:38 -0500 Subject: [PATCH 09/14] undo changes 2 --- init_scripts/vep81-init.sh | 2 +- init_scripts/vep85-init.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/init_scripts/vep81-init.sh b/init_scripts/vep81-init.sh index 586913350..96baac55a 100644 --- a/init_scripts/vep81-init.sh +++ b/init_scripts/vep81-init.sh @@ -35,4 +35,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh - /vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file +/vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file diff --git a/init_scripts/vep85-init.sh b/init_scripts/vep85-init.sh index a5da2066c..73d93bd31 100644 --- a/init_scripts/vep85-init.sh +++ b/init_scripts/vep85-init.sh @@ -39,4 +39,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep chmod a+rx /vep/run_hail_vep_vcf.sh - /vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file +/vep/run_hail_vep_vcf.sh /vep/1var.vcf \ No newline at end of file From 1e50c057b2bba8abe3774547d738801a2087df12 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:50:00 -0500 Subject: [PATCH 10/14] Apply suggestions from code review Apply review suggestions Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/filtering.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index ec9f60dae..565fa7458 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -407,8 +407,8 @@ def filter_to_gencode_cds( gencode_ht: Optional[hl.Table] = None, genes: Optional[Union[str, List[str]]] = None, by_gene_symbol: bool = True, - padding_bp: Optional[int] = 0, - max_intervals: Optional[int] = 3000, + padding_bp: int = 0, + max_collect_intervals: int = 3000, ) -> hl.Table: """ Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts. @@ -494,20 +494,18 @@ def filter_to_gencode_cds( ) ) - # Collect intervals if there are more than `max_intervals` to avoid memory issues - if gencode_ht.count() < max_intervals: + # Only collect intervals if there are less than or equal to `max_intervals` to avoid memory issues. + num_intervals = gencode_ht.count() + if num_intervals <= max_collect_intervals: logger.info( - "Since %d is smaller than the max intervals that can be collected, collecting all intervals...", - gencode_ht.count(), + "Since %d is less than or equal to 'max_collect_intervals', collecting all intervals...", + num_intervals, ) - cds_intervals = ( - gencode_ht.padded_interval if padding_bp else gencode_ht.interval - ) - cds_intervals = cds_intervals.collect() + cds_intervals = interval_expr.collect() t = hl.filter_intervals(t, cds_intervals) else: if padding_bp: - gencode_ht = gencode_ht.key_by("padded_interval") + gencode_ht = gencode_ht.key_by(padded_interval=interval_expr) t = ( t.filter_rows(hl.is_defined(gencode_ht[t.locus])) From e966735a681083b0e9ed7c19a97103e00a04a709 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:58:17 -0500 Subject: [PATCH 11/14] Apply review suggestions --- gnomad/utils/filtering.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 565fa7458..057e17309 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -444,7 +444,12 @@ def filter_to_gencode_cds( :param genes: Optional gene(s) to filter to. Can be a single gene string or list of genes. Default is None. :param by_gene_symbol: Whether to filter by gene symbol. Default is True. If False, will filter by gene ID. :param padding_bp: Number of bases to pad the CDS intervals by. Default is 0. - :param max_intervals: Maximum number of intervals to collect before filtering. Default is 1000. + :param max_collect_intervals: Maximum number of intervals for the use of + `hl.filter_intervals` for filtering. When the number of intervals to filter is + greater than this number, `filter`/`filter_rows` will be used instead. The + reason for this is that`hl.filter_intervals` is faster, but when the + number of intervals is too large, this can cause memory errors. Default is + 3000. :return: Table/MatrixTable filtered to loci in Gencode CDS intervals. """ if gencode_ht is None: @@ -470,28 +475,22 @@ def filter_to_gencode_cds( " documentation for more details to confirm it's being used as intended." ) if genes: - genes_upper = ( + genes = hl.literal( [genes.upper()] if isinstance(genes, str) else [g.upper() for g in genes] ) if by_gene_symbol: - gencode_ht = gencode_ht.filter( - hl.literal(genes_upper).contains(gencode_ht.gene_name) - ) - else: - gencode_ht = gencode_ht.filter( - hl.literal(genes_upper).contains(gencode_ht.gene_id) - ) + gene_field = "gene_name" if by_gene_symbol else "gene_id" + gencode_ht = gencode_ht.filter(genes.contains(gencode_ht[gene_field])) + interval_expr = gencode_ht.interval if padding_bp: - gencode_ht = gencode_ht.annotate( - padded_interval=hl.locus_interval( - gencode_ht.interval.start.contig, - gencode_ht.interval.start.position - padding_bp, - gencode_ht.interval.end.position + padding_bp, - includes_start=gencode_ht.interval.includes_start, - includes_end=gencode_ht.interval.includes_end, - reference_genome=gencode_ht.interval.start.dtype.reference_genome, - ) + interval_expr = hl.locus_interval( + interval_expr.start.contig, + interval_expr.start.position - padding_bp, + interval_expr.end.position + padding_bp, + includes_start=interval_expr.includes_start, + includes_end=interval_expr.includes_end, + reference_genome=interval_expr.start.dtype.reference_genome, ) # Only collect intervals if there are less than or equal to `max_intervals` to avoid memory issues. From 0adb9528f41ec72b60ee6a99fb19066d3da68df1 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:22:35 -0500 Subject: [PATCH 12/14] fix Unexpected indentation --- gnomad/utils/filtering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 057e17309..8f694f300 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -445,7 +445,7 @@ def filter_to_gencode_cds( :param by_gene_symbol: Whether to filter by gene symbol. Default is True. If False, will filter by gene ID. :param padding_bp: Number of bases to pad the CDS intervals by. Default is 0. :param max_collect_intervals: Maximum number of intervals for the use of - `hl.filter_intervals` for filtering. When the number of intervals to filter is + `hl.filter_intervals` for filtering. When the number of intervals to filter is greater than this number, `filter`/`filter_rows` will be used instead. The reason for this is that`hl.filter_intervals` is faster, but when the number of intervals is too large, this can cause memory errors. Default is From 6b740c241d05ab95000cc3df4ec4aa34ac924264 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:27:44 -0500 Subject: [PATCH 13/14] formatting --- gnomad/utils/filtering.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 8f694f300..bc0fcd890 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -493,7 +493,8 @@ def filter_to_gencode_cds( reference_genome=interval_expr.start.dtype.reference_genome, ) - # Only collect intervals if there are less than or equal to `max_intervals` to avoid memory issues. + # Only collect intervals if there are less than or equal to + # `max_intervals` to avoid memory issues. num_intervals = gencode_ht.count() if num_intervals <= max_collect_intervals: logger.info( From 2657f5e52857e1235be36ec844f896c1dd678657 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:35:05 -0500 Subject: [PATCH 14/14] Apply suggestions from code review Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/utils/filtering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index bc0fcd890..787d6d62b 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -447,7 +447,7 @@ def filter_to_gencode_cds( :param max_collect_intervals: Maximum number of intervals for the use of `hl.filter_intervals` for filtering. When the number of intervals to filter is greater than this number, `filter`/`filter_rows` will be used instead. The - reason for this is that`hl.filter_intervals` is faster, but when the + reason for this is that `hl.filter_intervals` is faster, but when the number of intervals is too large, this can cause memory errors. Default is 3000. :return: Table/MatrixTable filtered to loci in Gencode CDS intervals. @@ -494,7 +494,7 @@ def filter_to_gencode_cds( ) # Only collect intervals if there are less than or equal to - # `max_intervals` to avoid memory issues. + # `max_collect_intervals` to avoid memory issues. num_intervals = gencode_ht.count() if num_intervals <= max_collect_intervals: logger.info(