Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

grpmax vs European AF split nonsynonymous variant analysis #597

Merged
merged 18 commits into from
Jun 20, 2024
Merged
Changes from 10 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
294 changes: 294 additions & 0 deletions gnomad_qc/v4/analyses/grpmax_comps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
"""Find grpmax stats for gnomAD v4 and v2."""

import argparse
import logging
from pprint import pprint
from typing import Dict

import hail as hl
from gnomad.utils.vep import (
CSQ_CODING_HIGH_IMPACT,
CSQ_CODING_MEDIUM_IMPACT,
filter_vep_to_canonical_transcripts,
)
from tabulate import tabulate

from gnomad_qc.v2.resources.basics import get_gnomad_public_data
from gnomad_qc.v4.resources.release import release_sites

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger("grpmax_comps")
logger.setLevel(logging.INFO)


DIVERSE_GRPS = hl.literal({"afr", "amr", "eas", "mid", "sas"})
EUR_GRPS = {"all_eur": ["nfe", "fin", "asj"], "nfe_only": ["nfe"]}
FILTER_VALUES_TO_DROP = hl.array(["AC0", "InbreedingCoeff"])
AF_THRESHOLDS = [0.0001, 0.001]
NS_CONSEQ_TERMS = hl.literal(CSQ_CODING_HIGH_IMPACT + CSQ_CODING_MEDIUM_IMPACT)


def get_eur_freq(ht: hl.Table, eur_grps: list, version: str = "v4"):
"""
Calculate European genetic ancestry super group AF.

:param ht: gnomAD release Table
:param eur_grps: Set of genetic ancestry groups to be included in AF
:param version: gnomAD version
:return: Table with European AF annotation
"""
if version == "v4":
eur_indices = hl.literal(eur_grps).map(lambda g: ht.freq_index_dict[g + "_adj"])
else:
eur_indices = hl.literal(eur_grps).map(
lambda g: ht.freq_index_dict["gnomad_" + g]
)
ht = ht.annotate(
eur_AC=hl.sum(
eur_indices.map(
lambda f: hl.if_else(hl.is_defined(ht.freq[f].AC), ht.freq[f].AC, 0)
)
),
eur_AN=hl.sum(
eur_indices.map(
lambda f: hl.if_else(hl.is_defined(ht.freq[f].AN), ht.freq[f].AN, 0)
)
),
)
ht = ht.annotate(eur_AF=hl.if_else(ht.eur_AN > 0, ht.eur_AC / ht.eur_AN, 0.0))
return ht.drop("eur_AC", "eur_AN")


def filter_to_threshold(
ht: hl.Table,
af_threshold: float = 0.001,
version: str = "v4",
):
"""
Filter to variants where eur AF is < threshold while grpmax > threshold.

:param ht: gnomAD release Table
:param af_threshold: AF dividing threshold
:param version: gnomAD version
:return: Table filtered to variants meething threshold specifications
"""
if version == "v4":
grpmax_expr = ht.grpmax.gnomad
else:
grpmax_expr = ht.popmax[0]
ht = ht.filter((grpmax_expr.AF > af_threshold) & (ht.eur_AF < af_threshold))
return ht


def version_stats(
ht: hl.Table,
version: str = "v4",
can_only: bool = False,
drop_hard_filtered_variants: bool = False,
all_variants: bool = False,
) -> Dict[str, Dict[str, Dict[str, int]]]:
"""
Calculate grpmax stats for a given gnomAD version.

:param ht: gnomAD release Table
:param version: gnomAD version
:param can_only: Only consider MANE Select and canonical transcripts.
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved
:param drop_hard_filtered_variants: Remove only hard filtered variants but keep all other variants regardless of variant QC status
:param all_variants: Keep all variants regardless of variant QC status
:return: Dictionary of grpmax stats
"""
logger.info("Calculating grpmax stats for %s", version)
# Group results into nfe and all eur grps
results_by_eur_grping = {}

logger.info(
"Total number of variants in %s before filtering: %s", version, ht.count()
)
t_variants = ht.count()
if can_only:
logger.info("Filtering to only MANE Select and canonical transcripts")
ht = filter_vep_to_canonical_transcripts(ht, filter_empty_csq=True)
# All MANE select transcripts in v4 are also the canonical transcript
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor comment that this comment makes a little more sense above the call to filter_vep_to_canonical_transcripts

logger.info("Filtering to keep only non-synonymous variants...")
ht = ht.filter(
hl.any(
ht.vep.transcript_consequences.consequence_terms.map(
lambda x: hl.literal(NS_CONSEQ_TERMS).contains(x)
Copy link
Contributor

@ch-kr ch-kr May 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re-reading this and realizing maybe it's better to use filter_vep_transcript_csqs to simultaneously filter to canonical transcripts and to non-synonymous variants

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I restructured this and actually tried using this function but hit a bug: broadinstitute/gnomad_methods#707

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

)
)
)
ht = ht.checkpoint(
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_canonical_non_syn.ht",
overwrite=True,
)
ns_variants = ht.count()
logger.info(
"Total number of variants in %s after canonical/mane and consequence "
"filtering: %s (%.2f%% of total variants)",
version,
ns_variants,
(ns_variants / t_variants) * 100,
)

else:
# Filter to only non-synonymous terms CSQ_CODING_HIGH_IMPACT +
# CSQ_CODING_MEDIUM_IMPACT
logger.info(
"Filtering on most_severe_consequence to keep only non-synonymous"
" variants..."
)
ht = ht.filter(
hl.literal(NS_CONSEQ_TERMS).contains(ht.vep.most_severe_consequence)
)
ht = ht.checkpoint(
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_non_syn.ht", overwrite=True
)
ns_variants = ht.count()
logger.info(
"Total number of variants in %s after consequence filtering: %s (%.2f%% of "
"total variants)",
version,
ns_variants,
(ns_variants / t_variants) * 100,
)
if all_variants:
log_message = "of variants, regardless of variant QC status"
elif drop_hard_filtered_variants:
# Filter out AC0 and InbreedingCoeff variants
ht = ht.filter(~ht.filters.any(lambda x: FILTER_VALUES_TO_DROP.contains(x)))
log_message = "of variants passing ONLY hard filters (RF/VQSR are retained)"
else:
# Filter to only PASS
ht = ht.filter(hl.len(ht.filters) == 0)
log_message = "of variants passing all filters"

p_ns_variants = ht.count()
logger.info(
"Total number %s: %s (%.2f%% of"
" non-synonymous variants, %.2f%% of total variants)",
log_message,
p_ns_variants,
(p_ns_variants / ns_variants) * 100,
(p_ns_variants / t_variants) * 100,
)
# Iterate through just nfe group and all eur grp for calculating eur AF
for grp_id, grps in EUR_GRPS.items():
# Get european frequency by calculating the cumulative AF in the passed
# europe genetic ancestry list
p_ht = get_eur_freq(ht, eur_grps=grps, version=version)

grpmax_by_thresholds = {}
for threshold in AF_THRESHOLDS:
# Filter to keep only variants that split at the AF threshold
t_ht = filter_to_threshold(p_ht, threshold, version=version)
t_ht = t_ht.checkpoint(
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_{grp_id}_{threshold}.ht",
overwrite=True,
)
logger.info(
"Number of variants after filtering to AF threshold for %s at %s "
"threshold in %s: %s ( %.2f%%"
" of total variants)",
grp_id,
threshold,
version,
t_ht.count(),
# t_ht.count() / p_ns_variants * 100,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# t_ht.count() / p_ns_variants * 100,

t_ht.count() / t_variants * 100,
)
if version == "v4":
t_ht = t_ht.annotate(grpmax_ga=t_ht.grpmax.gnomad.gen_anc)
else:
t_ht = t_ht.annotate(grpmax_ga=t_ht.popmax[0].pop)

t_ht = t_ht.filter(DIVERSE_GRPS.contains(t_ht.grpmax_ga))

# For each diverse genetic ancestry group, aggregate the number of
# variants where that group is grpmax
grpmax_by_thresholds[threshold] = t_ht.aggregate(
hl.agg.counter(t_ht.grpmax_ga)
)
# Add a total count for all groups
grpmax_by_thresholds[threshold]["all"] = t_ht.count()

results_by_eur_grping[grp_id] = grpmax_by_thresholds
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
logger.info(f"Results for {version}")
pprint(results_by_eur_grping, sort_dicts=False)
return results_by_eur_grping


def create_table(
version_dict: Dict[str, Dict[str, Dict[str, int]]],
data_subset: str,
):
"""
Create tables of grpmax stats.

Tables have a column for each version as well as the difference for each grpmax group.

:param version_dict: Dictionary of grpmax stats
:param data_subset: Data subset
"""
headers = ["grpmax_grp", "v2", "v4", "difference(v4-v2)"]
for threshold in AF_THRESHOLDS:
table = []
for grpmax_group in hl.eval(hl.array(DIVERSE_GRPS)) + ["all"]:
v2_val = version_dict["v2"][data_subset][threshold].get(grpmax_group, 0)
v4_val = version_dict["v4"][data_subset][threshold].get(grpmax_group, 0)
diff = v4_val - v2_val
if diff < 0:
diff = f"\033[91m{diff}\033[0m"
else:
diff = f"\033[92m{diff}\033[0m"
table.append([grpmax_group, v2_val, v4_val, diff])
logger.info(
"\nNonsynonymous variant count by grpmax genetic ancestry group where the\n"
f"grpmax AF is above {threshold} and the {data_subset} AF is below it...\n"
f"{tabulate(table,headers=headers,tablefmt='fancy_grid')}"
)


def main(args):
"""Find grpmax stats for gnomAD v4 and v2."""
version_dict = {}
for version in ["v4", "v2"]:
if version == "v4":
ht = release_sites().ht()
else:
ht = get_gnomad_public_data("exomes")
version_dict[version] = version_stats(
ht,
version=version,
can_only=args.canonical_only,
drop_hard_filtered_variants=args.drop_hard_filtered_variants,
all_variants=args.all_variants,
)

# Create tables for "all_eur" and "nfe_only" data
create_table(version_dict, data_subset="all_eur")
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
create_table(version_dict, data_subset="nfe_only")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--canonical-only",
action="store_true",
help="Only consider MANE Select and canonical transcripts",
)
parser.add_argument(
"--drop-hard-filtered-variants",
action="store_true",
help="Drop variants that have been hard filtered",
)
parser.add_argument(
"--all-variants",
action="store_true",
help="Keep all variants regardless of variant QC status",
)
args = parser.parse_args()
if args.drop_hard_filtered_variants and args.all_variants:
raise ValueError(
"Cannot use --drop-hard-filtered-variants and --all-variants together"
)
main(args)
Loading