Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

grpmax vs European AF split nonsynonymous variant analysis #597

Merged
merged 18 commits into from
Jun 20, 2024
Merged
Changes from 4 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 41 additions & 39 deletions gnomad_qc/v4/analyses/grpmax_comps.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
import hail as hl
from gnomad.utils.vep import (
CSQ_CODING_HIGH_IMPACT,
CSQ_CODING_LOW_IMPACT,
CSQ_CODING_MEDIUM_IMPACT,
CSQ_NON_CODING,
filter_vep_to_canonical_transcripts,
)
from tabulate import tabulate

Expand Down Expand Up @@ -82,14 +81,20 @@ def filter_to_threshold(


def version_stats(
ht: hl.Table, version: str = "v4", can_only: bool = False
ht: hl.Table,
version: str = "v4",
can_only: bool = False,
drop_hard_filtered_variants: bool = False,
all_variants: bool = False,
) -> Dict[str, Dict[str, Dict[str, int]]]:
"""
Calculate grpmax stats for a given gnomAD version.

:param ht: gnomAD release Table
:param version: gnomAD version
:param can_only: Only consider MANE Select and canonical transcripts.
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved
:param drop_hard_filtered_variants: Remove only hard filtered variants but keep all other variants regardless of variant QC status
:param all_variants: Keep all variants regardless of variant QC status
:return: Dictionary of grpmax stats
"""
logger.info("Calculating grpmax stats for %s", version)
Expand All @@ -102,30 +107,16 @@ def version_stats(
t_variants = ht.count()
if can_only:
logger.info("Filtering to only MANE Select and canonical transcripts")
ht = ht.explode(ht.vep.transcript_consequences)
if version == "v2":
ht = ht.filter(hl.is_defined(ht.vep.transcript_consequences.canonical))
else:
ht = ht.filter(
hl.is_defined(ht.vep.transcript_consequences.canonical)
| hl.is_defined(ht.vep.transcript_consequences.mane_select)
)
logger.info(
"Filtering on mane_select and canonical transcript consequence term to "
"keep only non-synonymous variants..."
)
ht = filter_vep_to_canonical_transcripts(ht, filter_empty_csq=True)
# All MANE select transcripts in v4 are also the canonical transcript
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor comment that this comment makes a little more sense above the call to filter_vep_to_canonical_transcripts

logger.info("Filtering to keep only non-synonymous variants...")
ht = ht.filter(
hl.any(
ht.vep.transcript_consequences.consequence_terms.map(
lambda x: ~hl.literal(
CSQ_NON_CODING + CSQ_CODING_LOW_IMPACT
).contains(x)
lambda x: hl.literal(NS_CONSEQ_TERMS).contains(x)
Copy link
Contributor

@ch-kr ch-kr May 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re-reading this and realizing maybe it's better to use filter_vep_transcript_csqs to simultaneously filter to canonical transcripts and to non-synonymous variants

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I restructured this and actually tried using this function but hit a bug: broadinstitute/gnomad_methods#707

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

)
)
)
# Remove duplicate sites after exploding, most_severe_consequence is the same for all
# transcript consequences so we can use distinct to remove duplicates
ht = ht.distinct()
ht = ht.checkpoint(
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_canonical_non_syn.ht",
overwrite=True,
Expand All @@ -147,9 +138,7 @@ def version_stats(
" variants..."
)
ht = ht.filter(
~hl.literal(CSQ_NON_CODING + CSQ_CODING_LOW_IMPACT).contains(
ht.vep.most_severe_consequence
)
hl.literal(NS_CONSEQ_TERMS).contains(ht.vep.most_severe_consequence)
)
ht = ht.checkpoint(
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_non_syn.ht", overwrite=True
Expand All @@ -162,17 +151,20 @@ def version_stats(
ns_variants,
(ns_variants / t_variants) * 100,
)
if args.drop_hard_filtered_variants:
if all_variants:
log_message = "of variants, regardless of variant QC status"
elif drop_hard_filtered_variants:
# Filter out AC0 and InbreedingCoeff variants
ht = ht.filter(~ht.filters.any(lambda x: FILTER_VALUES_TO_DROP.contains(x)))
log_message = "ONLY hard filters (RF/VQSR are retained)"
log_message = "of variants passing ONLY hard filters (RF/VQSR are retained)"
else:
# Filter to only PASS
ht = ht.filter(hl.len(ht.filters) == 0)
log_message = "all filters"
log_message = "of variants passing all filters"

p_ns_variants = ht.count()
logger.info(
"Total number of variants passing %s: %s (%.2f%% of"
"Total number %s: %s (%.2f%% of"
" non-synonymous variants, %.2f%% of total variants)",
log_message,
p_ns_variants,
Expand All @@ -195,29 +187,26 @@ def version_stats(
)
logger.info(
"Number of variants after filtering to AF threshold for %s at %s "
"threshold in %s: %s (%.2f%% of passing non-synonymous variants, %.2f%%"
"threshold in %s: %s ( %.2f%%"
" of total variants)",
grp_id,
threshold,
version,
t_ht.count(),
t_ht.count() / p_ns_variants * 100,
# t_ht.count() / p_ns_variants * 100,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# t_ht.count() / p_ns_variants * 100,

t_ht.count() / t_variants * 100,
)
if version == "v4":
grpmax_ga_expr = t_ht.grpmax.gnomad.gen_anc
t_ht = t_ht.annotate(grpmax_ga=t_ht.grpmax.gnomad.gen_anc)
else:
grpmax_ga_expr = t_ht.popmax[0].pop
t_ht = t_ht.annotate(grpmax_ga=t_ht.popmax[0].pop)

t_ht = t_ht.filter(DIVERSE_GRPS.contains(t_ht.grpmax_ga))

t_ht = t_ht.filter(DIVERSE_GRPS.contains(grpmax_ga_expr))
if version == "v4":
grpmax_ga_expr = t_ht.grpmax.gnomad.gen_anc
else:
grpmax_ga_expr = t_ht.popmax[0].pop
# For each diverse genetic ancestry group, aggregate the number of
# variants where that group is grpmax
grpmax_by_thresholds[threshold] = t_ht.aggregate(
hl.agg.counter(grpmax_ga_expr)
hl.agg.counter(t_ht.grpmax_ga)
)
# Add a total count for all groups
grpmax_by_thresholds[threshold]["all"] = t_ht.count()
Expand Down Expand Up @@ -268,7 +257,11 @@ def main(args):
else:
ht = get_gnomad_public_data("exomes")
version_dict[version] = version_stats(
ht, version=version, can_only=args.canonical_only
ht,
version=version,
can_only=args.canonical_only,
drop_hard_filtered_variants=args.drop_hard_filtered_variants,
all_variants=args.all_variants,
)

# Create tables for "all_eur" and "nfe_only" data
Expand All @@ -288,5 +281,14 @@ def main(args):
action="store_true",
help="Drop variants that have been hard filtered",
)
parser.add_argument(
"--all-variants",
action="store_true",
help="Keep all variants regardless of variant QC status",
)
args = parser.parse_args()
if args.drop_hard_filtered_variants and args.all_variants:
raise ValueError(
"Cannot use --drop-hard-filtered-variants and --all-variants together"
)
main(args)
Loading