-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
grpmax vs European AF split nonsynonymous variant analysis #597
Changes from 5 commits
ee3f983
2d1494c
766d541
398f282
cef0eba
622ddc4
2389ca9
d4ccfa5
63dae9d
f572821
403df5f
093f774
b3f4b28
60c0510
2fc6627
466c25a
8ff42ee
c831c42
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -7,14 +7,18 @@ | |||||
from typing import Dict | ||||||
|
||||||
import hail as hl | ||||||
from gnomad.resources.grch38.reference_data import vep_context | ||||||
from gnomad.utils.vep import ( | ||||||
CSQ_CODING_HIGH_IMPACT, | ||||||
CSQ_CODING_MEDIUM_IMPACT, | ||||||
filter_vep_to_canonical_transcripts, | ||||||
process_consequences, | ||||||
) | ||||||
from tabulate import tabulate | ||||||
|
||||||
from gnomad_qc.v2.resources.basics import get_gnomad_public_data | ||||||
from gnomad_qc.v2.resources.basics import ( | ||||||
get_gnomad_liftover_data_path, | ||||||
get_gnomad_public_data, | ||||||
) | ||||||
from gnomad_qc.v4.resources.release import release_sites | ||||||
|
||||||
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") | ||||||
|
@@ -25,7 +29,7 @@ | |||||
DIVERSE_GRPS = ["afr", "amr", "eas", "mid", "sas"] | ||||||
EUR_GRPS = {"all_eur": ["nfe", "fin", "asj"], "nfe_only": ["nfe"]} | ||||||
FILTER_VALUES_TO_DROP = hl.array(["AC0", "InbreedingCoeff"]) | ||||||
AF_THRESHOLDS = [0.0001, 0.001] | ||||||
AF_THRESHOLDS = [0.0001, 0.001, 0.01] | ||||||
NS_CONSEQ_TERMS = CSQ_CODING_HIGH_IMPACT + CSQ_CODING_MEDIUM_IMPACT | ||||||
|
||||||
|
||||||
|
@@ -61,9 +65,7 @@ def get_eur_freq(ht: hl.Table, eur_grps: list, version: str = "v4"): | |||||
|
||||||
|
||||||
def filter_to_threshold( | ||||||
ht: hl.Table, | ||||||
af_threshold: float = 0.001, | ||||||
version: str = "v4", | ||||||
ht: hl.Table, af_threshold: float = 0.001, version: str = "v4", eur_filter=False | ||||||
): | ||||||
""" | ||||||
Filter to variants where eur AF is < threshold while grpmax > threshold. | ||||||
|
@@ -77,7 +79,10 @@ def filter_to_threshold( | |||||
grpmax_expr = ht.grpmax.gnomad | ||||||
else: | ||||||
grpmax_expr = ht.popmax[0] | ||||||
ht = ht.filter((grpmax_expr.AF > af_threshold) & (ht.eur_AF < af_threshold)) | ||||||
filter_expr = grpmax_expr.AF > af_threshold | ||||||
if eur_filter: | ||||||
filter_expr &= ht.eur_AF < af_threshold | ||||||
ht = ht.filter(filter_expr) | ||||||
return ht | ||||||
|
||||||
|
||||||
|
@@ -86,6 +91,7 @@ def version_stats( | |||||
t_variants: int, | ||||||
version: str = "v4", | ||||||
grpmax_counts: bool = False, | ||||||
eur_filter: bool = False, | ||||||
) -> Dict[str, Dict[str, Dict[str, int]]]: | ||||||
""" | ||||||
Calculate grpmax stats for a given gnomAD version. | ||||||
|
@@ -99,48 +105,73 @@ def version_stats( | |||||
results_dict = {} | ||||||
|
||||||
if grpmax_counts: | ||||||
logger.info( | ||||||
"Calculating grpmax counts for diverse genetic ancestry groups to " | ||||||
"deduplicate variant counts..." | ||||||
) | ||||||
# Iterate through just nfe group and all eur grp for calculating eur AF | ||||||
for grp_id, grps in EUR_GRPS.items(): | ||||||
# Get european frequency by calculating the cumulative AF in the passed | ||||||
# europe genetic ancestry list | ||||||
p_ht = get_eur_freq(ht, eur_grps=grps, version=version) | ||||||
if version == "v4": | ||||||
ht = ht.annotate( | ||||||
grpmax_ga=ht.grpmax.gnomad.gen_anc, | ||||||
grpmax_af=ht.grpmax.gnomad.AF, | ||||||
) | ||||||
else: | ||||||
ht = ht.annotate( | ||||||
grpmax_ga=ht.popmax[0].pop, | ||||||
grpmax_af=ht.popmax[0].AF, | ||||||
) | ||||||
|
||||||
counts_by_thresholds = {} | ||||||
if eur_filter: | ||||||
logger.info( | ||||||
"Calculating grpmax counts for diverse genetic ancestry groups to " | ||||||
"deduplicate variant counts..." | ||||||
) | ||||||
# Iterate through just nfe group and all eur grp for calculating eur AF | ||||||
for grp_id, grps in EUR_GRPS.items(): | ||||||
# Get european frequency by calculating the cumulative AF in the passed | ||||||
# europe genetic ancestry list | ||||||
p_ht = get_eur_freq(ht, eur_grps=grps, version=version) | ||||||
|
||||||
counts_by_thresholds = {} | ||||||
for threshold in AF_THRESHOLDS: | ||||||
t_ht = filter_to_threshold( | ||||||
p_ht, threshold, version=version, eur_filter=eur_filter | ||||||
) | ||||||
t_ht = t_ht.checkpoint( | ||||||
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_{grp_id}_{threshold}.ht", | ||||||
overwrite=True, | ||||||
) | ||||||
logger.info( | ||||||
"Number of variants after filtering grpmax AF threshold for %s" | ||||||
" at %s threshold in %s: %s ( %.2f%% of total variants)", | ||||||
grp_id, | ||||||
threshold, | ||||||
version, | ||||||
t_ht.count(), | ||||||
t_ht.count() / t_variants * 100, | ||||||
) | ||||||
|
||||||
t_ht = t_ht.filter( | ||||||
hl.literal(DIVERSE_GRPS).contains(t_ht.grpmax_ga) | ||||||
) | ||||||
|
||||||
# For each diverse genetic ancestry group, aggregate the number of | ||||||
# variants where that group is grpmax | ||||||
counts_by_thresholds[threshold] = t_ht.aggregate( | ||||||
hl.agg.counter(t_ht.grpmax_ga) | ||||||
) | ||||||
# Add a total count for all groups | ||||||
counts_by_thresholds[threshold]["all"] = t_ht.count() | ||||||
results_dict[grp_id] = counts_by_thresholds | ||||||
else: | ||||||
logger.info( | ||||||
"Aggregating grpmax variant counts for genetic ancestry groups..." | ||||||
) | ||||||
grps = deepcopy(DIVERSE_GRPS) + ["nfe"] | ||||||
for threshold in AF_THRESHOLDS: | ||||||
t_ht = filter_to_threshold(p_ht, threshold, version=version) | ||||||
t_ht = t_ht.checkpoint( | ||||||
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_{grp_id}_{threshold}.ht", | ||||||
overwrite=True, | ||||||
t_ht = filter_to_threshold(ht, threshold, version=version) | ||||||
results_dict[threshold] = t_ht.aggregate( | ||||||
hl.dict( | ||||||
{grp: hl.agg.count_where(t_ht.grpmax_ga == grp) for grp in grps} | ||||||
) | ||||||
) | ||||||
logger.info( | ||||||
"Number of variants after filtering grpmax AF threshold for %s at" | ||||||
" %s threshold in %s: %s ( %.2f%% of total variants)", | ||||||
grp_id, | ||||||
threshold, | ||||||
version, | ||||||
t_ht.count(), | ||||||
t_ht.count() / t_variants * 100, | ||||||
) | ||||||
|
||||||
if version == "v4": | ||||||
t_ht = t_ht.annotate(grpmax_ga=t_ht.grpmax.gnomad.gen_anc) | ||||||
else: | ||||||
t_ht = t_ht.annotate(grpmax_ga=t_ht.popmax[0].pop) | ||||||
results_dict[threshold]["all"] = t_ht.count() | ||||||
|
||||||
t_ht = t_ht.filter(hl.literal(DIVERSE_GRPS).contains(t_ht.grpmax_ga)) | ||||||
|
||||||
# For each diverse genetic ancestry group, aggregate the number of | ||||||
# variants where that group is grpmax | ||||||
counts_by_thresholds[threshold] = t_ht.aggregate( | ||||||
hl.agg.counter(t_ht.grpmax_ga) | ||||||
) | ||||||
# Add a total count for all groups | ||||||
counts_by_thresholds[threshold]["all"] = t_ht.count() | ||||||
results_dict[grp_id] = counts_by_thresholds | ||||||
else: | ||||||
logger.info( | ||||||
"Aggregating variant counts for genetic ancestry groups, variants " | ||||||
|
@@ -172,6 +203,7 @@ def create_table( | |||||
data_subset: str = None, | ||||||
grpmax_counts: bool = False, | ||||||
non_syn_only: bool = False, | ||||||
eur_filter: bool = False, | ||||||
): | ||||||
""" | ||||||
Create tables of grpmax stats. | ||||||
|
@@ -181,19 +213,24 @@ def create_table( | |||||
:param version_dict: Dictionary of grpmax stats | ||||||
:param data_subset: Data subset | ||||||
:param grpmax_counts: Results calculated using only grpmax counts for diverse genetic ancestry groups | ||||||
:param non_syn_only: Keep only non-synonymous variants | ||||||
:param eur_filter: Filter to variants where eur AF is < threshold while grpmax > threshold | ||||||
""" | ||||||
headers = ["genetic ancestry group", "v2", "v4", "difference(v4-v2)"] | ||||||
v2_dict_index = ( | ||||||
version_dict["v2"][data_subset] if grpmax_counts else version_dict["v2"] | ||||||
version_dict["v2"][data_subset] | ||||||
if grpmax_counts and eur_filter | ||||||
else version_dict["v2"] | ||||||
) | ||||||
v4_dict_index = ( | ||||||
version_dict["v4"][data_subset] if grpmax_counts else version_dict["v4"] | ||||||
version_dict["v4"][data_subset] | ||||||
if grpmax_counts and eur_filter | ||||||
else version_dict["v4"] | ||||||
) | ||||||
grps = deepcopy(DIVERSE_GRPS) | ||||||
grps = grps + ["all"] | ||||||
|
||||||
if grpmax_counts: | ||||||
grps.append("all") | ||||||
else: | ||||||
if not eur_filter: | ||||||
grps.append("nfe") | ||||||
|
||||||
for threshold in AF_THRESHOLDS: | ||||||
|
@@ -209,8 +246,8 @@ def create_table( | |||||
table.append([grp, v2_val, v4_val, diff]) | ||||||
logger.info( | ||||||
f"\n{'non-synonymous' if non_syn_only else ''} variant count by genetic " | ||||||
f"ancestry group where the\n{'grpmax' if grpmax_counts else ''} AF is above" | ||||||
f" {threshold}{' and the ' + str(data_subset) + ' AF is below it' if grpmax_counts else ''}...\n" | ||||||
f"ancestry group where the{' grpmax' if grpmax_counts else ''} AF is above" | ||||||
f" {threshold}{' and the ' + str(data_subset) + ' AF is below it' if grpmax_counts and eur_filter else ''}...\n" | ||||||
f"{tabulate(table, headers=headers, tablefmt='fancy_grid')}" | ||||||
) | ||||||
|
||||||
|
@@ -220,12 +257,20 @@ def main(args): | |||||
version_dict = {} | ||||||
grpmax_counts = args.grpmax_counts | ||||||
non_syn_only = args.non_syn_only | ||||||
csq_terms = args.csq_terms | ||||||
eur_filter = args.eur_filter | ||||||
csq_terms = csq_terms if csq_terms else non_syn_only | ||||||
|
||||||
for version in ["v4", "v2"]: | ||||||
if version == "v4": | ||||||
ht = release_sites().ht() | ||||||
elif args.use_v2_liftover: | ||||||
ht = hl.read_table(get_gnomad_liftover_data_path("exomes")) | ||||||
vep_ht = vep_context.versions["105"].ht() | ||||||
ht = ht.annotate(vep=vep_ht[ht.key].vep) | ||||||
else: | ||||||
ht = get_gnomad_public_data("exomes") | ||||||
|
||||||
if args.test: | ||||||
# Filter to two partitions | ||||||
ht = ht._filter_partitions([0, 1]) | ||||||
|
@@ -235,32 +280,36 @@ def main(args): | |||||
"Total number of variants in %s before filtering: %s", version, t_variants | ||||||
) | ||||||
msg = "" | ||||||
# All MANE Select transcripts are canonical | ||||||
if args.canonical: | ||||||
logger.info("Filtering to only MANE Select and canonical transcripts...") | ||||||
ht = filter_vep_to_canonical_transcripts(ht, filter_empty_csq=True) | ||||||
msg = "canonical transcript filtering" | ||||||
|
||||||
if non_syn_only: | ||||||
logger.info("Filtering to keep only non-synonymous variants...") | ||||||
# NOTE: There is no garauntree the most severe consequence is from the | ||||||
# canonical transcript if table is filtered to canonical transcripts | ||||||
ht = ht.filter( | ||||||
hl.literal(NS_CONSEQ_TERMS).contains(ht.vep.most_severe_consequence) | ||||||
) | ||||||
ns_variants = ht.count() | ||||||
|
||||||
if msg: | ||||||
msg += " and " | ||||||
msg += "non-synonymous consequence filtering" | ||||||
ht = process_consequences(ht, has_polyphen=False) | ||||||
|
||||||
if csq_terms: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this doesn't need an if, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🔎 |
||||||
logger.info( | ||||||
"Total number of variants in %s %s: %s (%.2f%% of total variants)", | ||||||
version, | ||||||
"after " + msg if msg else "", | ||||||
ns_variants, | ||||||
(ns_variants / t_variants) * 100, | ||||||
"Filtering to keep only %s ...", | ||||||
( | ||||||
str(csq_terms) + " variants" | ||||||
if csq_terms and not non_syn_only | ||||||
else "non-synonymous variants" | ||||||
), | ||||||
) | ||||||
if args.canonical: | ||||||
vep_csq_expr = ht.vep.worst_csq_for_variant_canonical | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ooh, nice |
||||||
msg += " on canonical transcripts" | ||||||
else: | ||||||
vep_csq_expr = ht.vep.worst_csq_for_variant | ||||||
msg += " on all transcripts" | ||||||
|
||||||
ht = ht.filter( | ||||||
hl.literal(csq_terms).contains(vep_csq_expr.most_severe_consequence) | ||||||
) | ||||||
ns_variants = ht.count() | ||||||
|
||||||
logger.info( | ||||||
"Total number of variants in %s %s: %s (%.2f%% of total variants)", | ||||||
version, | ||||||
"after " + msg if msg else "", | ||||||
ns_variants, | ||||||
(ns_variants / t_variants) * 100, | ||||||
) | ||||||
|
||||||
ht = ht.checkpoint( | ||||||
f"gs://gnomad-tmp-4day/grp_comps_{version}_canonical_non_syn.ht", | ||||||
|
@@ -291,18 +340,22 @@ def main(args): | |||||
t_variants=t_variants, | ||||||
version=version, | ||||||
grpmax_counts=grpmax_counts, | ||||||
eur_filter=eur_filter, | ||||||
) | ||||||
|
||||||
if grpmax_counts: | ||||||
if grpmax_counts and eur_filter: | ||||||
for eur_grp in EUR_GRPS.keys(): | ||||||
create_table( | ||||||
version_dict, | ||||||
data_subset=eur_grp, | ||||||
grpmax_counts=grpmax_counts, | ||||||
non_syn_only=non_syn_only, | ||||||
eur_filter=eur_filter, | ||||||
) | ||||||
else: | ||||||
create_table(version_dict, non_syn_only=non_syn_only) | ||||||
create_table( | ||||||
version_dict, grpmax_counts=grpmax_counts, non_syn_only=non_syn_only | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
also very minor -- just seems a little clearer to not specify either There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So I kept this in so you can run it without the eur filter and still get overall grpmax counts if you want, or if you dont care to deduplicate, you can still get the AF>threshold counts |
||||||
) | ||||||
|
||||||
|
||||||
if __name__ == "__main__": | ||||||
|
@@ -337,9 +390,30 @@ def main(args): | |||||
action="store_true", | ||||||
help="Filter to two partitions of each HT for testing purposes", | ||||||
) | ||||||
parser.add_argument( | ||||||
"--csq-terms", | ||||||
nargs="+", | ||||||
help="Consequence terms to consider", | ||||||
default=NS_CONSEQ_TERMS, | ||||||
) | ||||||
parser.add_argument( | ||||||
"--eur-filter", | ||||||
action="store_true", | ||||||
help="Filter to variants where eur AF is < threshold while grpmax > threshold", | ||||||
) | ||||||
parser.add_argument( | ||||||
"--use-v2-liftover", | ||||||
action="store_true", | ||||||
help="Use v2 liftover annotated with vep 105 for comparison", | ||||||
) | ||||||
|
||||||
args = parser.parse_args() | ||||||
if args.drop_hard_filtered and args.no_variant_qc_filters: | ||||||
raise ValueError( | ||||||
"Cannot use --drop-hard-filtered-variants and --all-variants together" | ||||||
) | ||||||
if args.non_syn_only and (args.csq_terms != NS_CONSEQ_TERMS): | ||||||
raise ValueError("Cannot use --non-syn-only and --csq-terms together") | ||||||
if args.eur_filter and not args.grpmax_counts: | ||||||
raise ValueError("Cannot use --eur-filter without --grpmax-counts") | ||||||
main(args) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just realized that this code block could likely get moved above for efficiency since we no longer need to log the counts after every filter: