Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

grpmax vs European AF split nonsynonymous variant analysis #597

Merged
merged 18 commits into from
Jun 20, 2024
Merged
Changes from 3 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 175 additions & 0 deletions gnomad_qc/v4/analyses/grpmax_comps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
"""Find grpmax stats for gnomAD v4 and v2."""

import logging
from pprint import pprint
from typing import Dict

import hail as hl
from gnomad.utils.vep import CSQ_CODING_HIGH_IMPACT, CSQ_CODING_MEDIUM_IMPACT
from tabulate import tabulate

from gnomad_qc.v2.resources.basics import get_gnomad_public_data
from gnomad_qc.v4.resources.release import release_sites

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger("grpmax_comps")
logger.setLevel(logging.INFO)


DIVERSE_GRPS = hl.literal({"afr", "amr", "eas", "mid", "sas"})
EUR_GRPS = {"all_eur": ["nfe", "fin", "asj"], "nfe_only": ["nfe"]}

AF_THRESHOLDS = [0.0001, 0.001]
NS_CONSEQ_TERMS = hl.literal(CSQ_CODING_HIGH_IMPACT + CSQ_CODING_MEDIUM_IMPACT)


def get_eur_freq(ht: hl.Table, eur_grps: list, version: str = "v4"):
"""
Calculate European genetic ancestry super group AF.

:param ht: gnomAD release Table
:param eur_grps: Set of genetic ancestry groups to be included in AF
:param version: gnomAD version
:return: Table with European AF annotation
"""
if version == "v4":
eur_indices = hl.literal(eur_grps).map(lambda g: ht.freq_index_dict[g + "_adj"])
else:
eur_indices = hl.literal(eur_grps).map(
lambda g: ht.freq_index_dict["gnomad_" + g]
)
ht = ht.annotate(
eur_AF=hl.sum(
eur_indices.map(
lambda f: hl.if_else(hl.is_defined(ht.freq[f].AC), ht.freq[f].AC, 0)
)
)
/ hl.sum(
eur_indices.map(
lambda f: hl.if_else(hl.is_defined(ht.freq[f].AN), ht.freq[f].AN, 0)
)
)
)
ht = ht.filter(hl.is_defined(ht.eur_AF))
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved

return ht


def filter_to_threshold(
ht: hl.Table,
af_threshold: float = 0.001,
version: str = "v4",
):
"""
Filter to variants where eur AF is < threshold while grpmax > threshold.

:param ht: gnomAD release Table
:param af_threshold: AF dividing threshold
:param version: gnomAD version
:return: Table filtered to variants meething threshold specifications
"""
if version == "v4":
grpmax_expr = ht.grpmax.gnomad
else:
grpmax_expr = ht.popmax[0]
ht = ht.filter((grpmax_expr.AF > af_threshold) & (ht.eur_AF < af_threshold))
return ht


def version_stats(
ht: hl.Table, version: str = "v4"
) -> Dict[str, Dict[str, Dict[str, int]]]:
"""
Calculate grpmax stats for a given gnomAD version.

:param ht: gnomAD release Table
:param version: gnomAD version
:return: Dictionary of grpmax stats
"""
logger.info(f"Calculating grpmax stats for {version}")
# Group results into nfe and all eur grps
results_by_eur_grping = {}
# Filter to only non-synonymous terms CSQ_CODING_HIGH_IMPACT +
# CSQ_CODING_MEDIUM_IMPACT
ht = ht.filter(NS_CONSEQ_TERMS.contains(ht.vep.most_severe_consequence))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added some print statements to check these filters:

v4 HT:
- Total count = 183717261
- Filtering to non-synonymous keeps 23786562 variants (~13% of all variants)
- Filtering to variant QC pass keeps 16319240 variants (~69% of all non-synonymous variants)

v2 HT:
- Total count = 17209972
- Non-syn = 7849006 (~46%)
- QC pass = 6923424 (~88% non-syn)

The proportion of variants retained in v2 is much higher, which makes me wonder: is there something off about the most_severe_consequence field in the v4 HT? Maybe a helpful debugging step is to rerun after filtering v4 to MANE/canonical transcripts only and v2 to canonical only?

# Filter to PASS variants only
ht = ht.filter(ht.filters.length() == 0)
# Iterate through just nfe group and all eur grp for calculating eur AF
for grp_id, grps in EUR_GRPS.items():
# Get european frequency by calculating the cumulative AF in the passed
# europe genetic ancestry list
p_ht = get_eur_freq(ht, eur_grps=grps, version=version)

grpmax_by_thresholds = {}
for threshold in AF_THRESHOLDS:
# Filter to keep only variants that split at the AF threshold
t_ht = filter_to_threshold(p_ht, threshold, version=version)
if version == "v4":
grpmax_ga_expr = t_ht.grpmax.gnomad.gen_anc
else:
grpmax_ga_expr = t_ht.popmax[0].pop

t_ht = t_ht.filter(DIVERSE_GRPS.contains(grpmax_ga_expr))
if version == "v4":
grpmax_ga_expr = t_ht.grpmax.gnomad.gen_anc
else:
grpmax_ga_expr = t_ht.popmax[0].pop
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you need this twice?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sadly yes because the filter above changes the table so the expr assignment in L108 and L110 points to the old table and breaks hail. There may be another way that my Friday brain isnt thinking of though...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh yeah I totally forgot -- good call. I guess the alternative is probably something like

            if version == "v4":
                t_ht = t_ht.annotate(grpmax_ga=t_ht.grpmax.gnomad.gen_anc)
            else:
                t_ht = t_ht.annotate(grpmax_ga=t_ht.popmax[0].pop)

            t_ht = t_ht.filter(DIVERSE_GRPS.contains(t_ht.grpmax_ga))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes -- this is better

# For each diverse genetic ancestry group, aggregate the number of
# variants where that group is grpmax
grpmax_by_thresholds[threshold] = t_ht.aggregate(
hl.agg.counter(grpmax_ga_expr)
)
results_by_eur_grping[grp_id] = grpmax_by_thresholds
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
logger.info(f"Results for {version}")
pprint(results_by_eur_grping, sort_dicts=False)
return results_by_eur_grping


def create_table(
version_dict: Dict[str, Dict[str, Dict[str, int]]],
data_subset: str,
):
"""
Create tables of grpmax stats.

Tables have a column for each version as well as the difference for each grpmax group.

:param version_dict: Dictionary of grpmax stats
:param data_subset: Data subset
"""
headers = ["grpmax_grp", "v2", "v4", "difference(v4-v2)"]
for threshold in AF_THRESHOLDS:
table = []
for grpmax_group in hl.eval(DIVERSE_GRPS):
v2_val = version_dict["v2"][data_subset][threshold].get(grpmax_group, 0)
v4_val = version_dict["v4"][data_subset][threshold].get(grpmax_group, 0)
diff = v4_val - v2_val
if diff < 0:
diff = f"\033[91m{diff}\033[0m"
else:
diff = f"\033[92m{diff}\033[0m"
table.append([grpmax_group, v2_val, v4_val, diff])
logger.info(
"\nNonsynonymous variant count by grpmax genetic ancestry group where the\n"
f"grpmax AF is above {threshold} and the {data_subset} AF is below it...\n"
f"{tabulate(table,headers=headers,tablefmt='fancy_grid')}"
)


def main():
"""Find grpmax stats for gnomAD v4 and v2."""
version_dict = {}
for version in ["v4", "v2"]:
if version == "v4":
ht = release_sites().ht()
else:
ht = get_gnomad_public_data("exomes")
version_dict[version] = version_stats(ht, version=version)

# Create tables for "all_eur" and "nfe_only" data
create_table(version_dict, data_subset="all_eur")
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
create_table(version_dict, data_subset="nfe_only")


if __name__ == "__main__":
main()
Loading