Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter to Gencode CDS by genes and by exon paddings #747

Merged
merged 14 commits into from
Dec 20, 2024
58 changes: 53 additions & 5 deletions gnomad/utils/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,12 @@ def filter_to_clinvar_pathogenic(


def filter_to_gencode_cds(
t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None
t: Union[hl.MatrixTable, hl.Table],
gencode_ht: Optional[hl.Table] = None,
genes: Optional[Union[str, List[str]]] = None,
by_gene_symbol: bool = True,
padding_bp: Optional[int] = 0,
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
max_intervals: Optional[int] = 3000,
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
) -> hl.Table:
"""
Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts.
Expand Down Expand Up @@ -436,6 +441,10 @@ def filter_to_gencode_cds(
:param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable
to CDS regions. Default is None, which will use the default version of the
Gencode Table resource.
:param genes: Optional gene(s) to filter to. Can be a single gene string or list of genes. Default is None.
:param by_gene_symbol: Whether to filter by gene symbol. Default is True. If False, will filter by gene ID.
:param padding_bp: Number of bases to pad the CDS intervals by. Default is 0.
:param max_intervals: Maximum number of intervals to collect before filtering. Default is 1000.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
:return: Table/MatrixTable filtered to loci in Gencode CDS intervals.
"""
if gencode_ht is None:
Expand All @@ -460,12 +469,51 @@ def filter_to_gencode_cds(
"This Gencode CDS interval filter does not filter by transcript! Please see the"
" documentation for more details to confirm it's being used as intended."
)
filter_expr = hl.is_defined(gencode_ht[t.locus])
if genes:
genes_upper = (
[genes.upper()] if isinstance(genes, str) else [g.upper() for g in genes]
)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
if by_gene_symbol:
gencode_ht = gencode_ht.filter(
hl.literal(genes_upper).contains(gencode_ht.gene_name)
)
else:
gencode_ht = gencode_ht.filter(
hl.literal(genes_upper).contains(gencode_ht.gene_id)
)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

if isinstance(t, hl.MatrixTable):
t = t.filter_rows(filter_expr)
if padding_bp:
gencode_ht = gencode_ht.annotate(
padded_interval=hl.locus_interval(
gencode_ht.interval.start.contig,
gencode_ht.interval.start.position - padding_bp,
gencode_ht.interval.end.position + padding_bp,
includes_start=gencode_ht.interval.includes_start,
includes_end=gencode_ht.interval.includes_end,
reference_genome=gencode_ht.interval.start.dtype.reference_genome,
)
)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

# Collect intervals if there are more than `max_intervals` to avoid memory issues
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
if gencode_ht.count() < max_intervals:
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
logger.info(
"Since %d is smaller than the max intervals that can be collected, collecting all intervals...",
gencode_ht.count(),
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
)
cds_intervals = (
gencode_ht.padded_interval if padding_bp else gencode_ht.interval
)
cds_intervals = cds_intervals.collect()
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
t = hl.filter_intervals(t, cds_intervals)
else:
t = t.filter(filter_expr)
if padding_bp:
gencode_ht = gencode_ht.key_by("padded_interval")
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

t = (
t.filter_rows(hl.is_defined(gencode_ht[t.locus]))
if isinstance(t, hl.MatrixTable)
else t.filter(hl.is_defined(gencode_ht[t.locus]))
)

return t

Expand Down
6 changes: 2 additions & 4 deletions init_scripts/vep81-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


# Copy VEP
mkdir -p /vep/homo_sapiens
mkdir -p /vep/homo_sapiens
gsutil -m cp -r gs://hail-common/vep/vep/loftee /vep
gsutil -m cp -r gs://hail-common/vep/vep/ensembl-tools-release-81 /vep
gsutil -m cp -r gs://hail-common/vep/vep/loftee_data /vep
Expand Down Expand Up @@ -35,6 +35,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep
gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep
chmod a+rx /vep/run_hail_vep_vcf.sh

/vep/run_hail_vep_vcf.sh /vep/1var.vcf


/vep/run_hail_vep_vcf.sh /vep/1var.vcf
8 changes: 2 additions & 6 deletions init_scripts/vep85-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


# Copy VEP
mkdir -p /vep/homo_sapiens
mkdir -p /vep/homo_sapiens
gsutil -m cp -r gs://hail-common/vep/vep/loftee /vep
gsutil -m cp -r gs://hail-common/vep/vep/ensembl-tools-release-85 /vep
gsutil -m cp -r gs://hail-common/vep/vep/loftee_data /vep
Expand Down Expand Up @@ -39,8 +39,4 @@ gsutil cp gs://hail-common/vep/vep/1var.vcf /vep
gsutil cp gs://hail-common/vep/vep/run_hail_vep_vcf.sh /vep
chmod a+rx /vep/run_hail_vep_vcf.sh

/vep/run_hail_vep_vcf.sh /vep/1var.vcf




/vep/run_hail_vep_vcf.sh /vep/1var.vcf
Loading