Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

grpmax vs European AF split nonsynonymous variant analysis #597

Merged
merged 18 commits into from
Jun 20, 2024
Merged
Changes from 5 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 151 additions & 77 deletions gnomad_qc/v4/analyses/grpmax_comps.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@
from typing import Dict

import hail as hl
from gnomad.resources.grch38.reference_data import vep_context
from gnomad.utils.vep import (
CSQ_CODING_HIGH_IMPACT,
CSQ_CODING_MEDIUM_IMPACT,
filter_vep_to_canonical_transcripts,
process_consequences,
)
from tabulate import tabulate

from gnomad_qc.v2.resources.basics import get_gnomad_public_data
from gnomad_qc.v2.resources.basics import (
get_gnomad_liftover_data_path,
get_gnomad_public_data,
)
from gnomad_qc.v4.resources.release import release_sites

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
Expand All @@ -25,7 +29,7 @@
DIVERSE_GRPS = ["afr", "amr", "eas", "mid", "sas"]
EUR_GRPS = {"all_eur": ["nfe", "fin", "asj"], "nfe_only": ["nfe"]}
FILTER_VALUES_TO_DROP = hl.array(["AC0", "InbreedingCoeff"])
AF_THRESHOLDS = [0.0001, 0.001]
AF_THRESHOLDS = [0.0001, 0.001, 0.01]
NS_CONSEQ_TERMS = CSQ_CODING_HIGH_IMPACT + CSQ_CODING_MEDIUM_IMPACT


Expand Down Expand Up @@ -61,9 +65,7 @@ def get_eur_freq(ht: hl.Table, eur_grps: list, version: str = "v4"):


def filter_to_threshold(
ht: hl.Table,
af_threshold: float = 0.001,
version: str = "v4",
ht: hl.Table, af_threshold: float = 0.001, version: str = "v4", eur_filter=False
):
"""
Filter to variants where eur AF is < threshold while grpmax > threshold.
Expand All @@ -77,7 +79,10 @@ def filter_to_threshold(
grpmax_expr = ht.grpmax.gnomad
else:
grpmax_expr = ht.popmax[0]
ht = ht.filter((grpmax_expr.AF > af_threshold) & (ht.eur_AF < af_threshold))
filter_expr = grpmax_expr.AF > af_threshold
if eur_filter:
filter_expr &= ht.eur_AF < af_threshold
ht = ht.filter(filter_expr)
return ht


Expand All @@ -86,6 +91,7 @@ def version_stats(
t_variants: int,
version: str = "v4",
grpmax_counts: bool = False,
eur_filter: bool = False,
) -> Dict[str, Dict[str, Dict[str, int]]]:
"""
Calculate grpmax stats for a given gnomAD version.
Expand All @@ -99,48 +105,73 @@ def version_stats(
results_dict = {}

if grpmax_counts:
logger.info(
"Calculating grpmax counts for diverse genetic ancestry groups to "
"deduplicate variant counts..."
)
# Iterate through just nfe group and all eur grp for calculating eur AF
for grp_id, grps in EUR_GRPS.items():
# Get european frequency by calculating the cumulative AF in the passed
# europe genetic ancestry list
p_ht = get_eur_freq(ht, eur_grps=grps, version=version)
if version == "v4":
ht = ht.annotate(
grpmax_ga=ht.grpmax.gnomad.gen_anc,
grpmax_af=ht.grpmax.gnomad.AF,
)
else:
ht = ht.annotate(
grpmax_ga=ht.popmax[0].pop,
grpmax_af=ht.popmax[0].AF,
)

counts_by_thresholds = {}
if eur_filter:
logger.info(
"Calculating grpmax counts for diverse genetic ancestry groups to "
"deduplicate variant counts..."
)
# Iterate through just nfe group and all eur grp for calculating eur AF
for grp_id, grps in EUR_GRPS.items():
# Get european frequency by calculating the cumulative AF in the passed
# europe genetic ancestry list
p_ht = get_eur_freq(ht, eur_grps=grps, version=version)

counts_by_thresholds = {}
for threshold in AF_THRESHOLDS:
t_ht = filter_to_threshold(
p_ht, threshold, version=version, eur_filter=eur_filter
)
t_ht = t_ht.checkpoint(
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_{grp_id}_{threshold}.ht",
overwrite=True,
)
logger.info(
"Number of variants after filtering grpmax AF threshold for %s"
" at %s threshold in %s: %s ( %.2f%% of total variants)",
grp_id,
threshold,
version,
t_ht.count(),
t_ht.count() / t_variants * 100,
)

t_ht = t_ht.filter(
hl.literal(DIVERSE_GRPS).contains(t_ht.grpmax_ga)
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just realized that this code block could likely get moved above for efficiency since we no longer need to log the counts after every filter:

t_ht = filter_to_threshold(
    p_ht, threshold, version=version, eur_filter=eur_filter
)
t_ht = t_ht.filter(
    hl.literal(DIVERSE_GRPS).contains(t_ht.grpmax_ga)
)
t_ht = t_ht.checkpoint(
    f"gs://gnomad-tmp-4day/grpmax_comps_{version}_{grp_id}_{threshold}.ht",
    overwrite=True,
)


# For each diverse genetic ancestry group, aggregate the number of
# variants where that group is grpmax
counts_by_thresholds[threshold] = t_ht.aggregate(
hl.agg.counter(t_ht.grpmax_ga)
)
# Add a total count for all groups
counts_by_thresholds[threshold]["all"] = t_ht.count()
results_dict[grp_id] = counts_by_thresholds
else:
logger.info(
"Aggregating grpmax variant counts for genetic ancestry groups..."
)
grps = deepcopy(DIVERSE_GRPS) + ["nfe"]
for threshold in AF_THRESHOLDS:
t_ht = filter_to_threshold(p_ht, threshold, version=version)
t_ht = t_ht.checkpoint(
f"gs://gnomad-tmp-4day/grpmax_comps_{version}_{grp_id}_{threshold}.ht",
overwrite=True,
t_ht = filter_to_threshold(ht, threshold, version=version)
results_dict[threshold] = t_ht.aggregate(
hl.dict(
{grp: hl.agg.count_where(t_ht.grpmax_ga == grp) for grp in grps}
)
)
logger.info(
"Number of variants after filtering grpmax AF threshold for %s at"
" %s threshold in %s: %s ( %.2f%% of total variants)",
grp_id,
threshold,
version,
t_ht.count(),
t_ht.count() / t_variants * 100,
)

if version == "v4":
t_ht = t_ht.annotate(grpmax_ga=t_ht.grpmax.gnomad.gen_anc)
else:
t_ht = t_ht.annotate(grpmax_ga=t_ht.popmax[0].pop)
results_dict[threshold]["all"] = t_ht.count()

t_ht = t_ht.filter(hl.literal(DIVERSE_GRPS).contains(t_ht.grpmax_ga))

# For each diverse genetic ancestry group, aggregate the number of
# variants where that group is grpmax
counts_by_thresholds[threshold] = t_ht.aggregate(
hl.agg.counter(t_ht.grpmax_ga)
)
# Add a total count for all groups
counts_by_thresholds[threshold]["all"] = t_ht.count()
results_dict[grp_id] = counts_by_thresholds
else:
logger.info(
"Aggregating variant counts for genetic ancestry groups, variants "
Expand Down Expand Up @@ -172,6 +203,7 @@ def create_table(
data_subset: str = None,
grpmax_counts: bool = False,
non_syn_only: bool = False,
eur_filter: bool = False,
):
"""
Create tables of grpmax stats.
Expand All @@ -181,19 +213,24 @@ def create_table(
:param version_dict: Dictionary of grpmax stats
:param data_subset: Data subset
:param grpmax_counts: Results calculated using only grpmax counts for diverse genetic ancestry groups
:param non_syn_only: Keep only non-synonymous variants
:param eur_filter: Filter to variants where eur AF is < threshold while grpmax > threshold
"""
headers = ["genetic ancestry group", "v2", "v4", "difference(v4-v2)"]
v2_dict_index = (
version_dict["v2"][data_subset] if grpmax_counts else version_dict["v2"]
version_dict["v2"][data_subset]
if grpmax_counts and eur_filter
else version_dict["v2"]
)
v4_dict_index = (
version_dict["v4"][data_subset] if grpmax_counts else version_dict["v4"]
version_dict["v4"][data_subset]
if grpmax_counts and eur_filter
else version_dict["v4"]
)
grps = deepcopy(DIVERSE_GRPS)
grps = grps + ["all"]

if grpmax_counts:
grps.append("all")
else:
if not eur_filter:
grps.append("nfe")

for threshold in AF_THRESHOLDS:
Expand All @@ -209,8 +246,8 @@ def create_table(
table.append([grp, v2_val, v4_val, diff])
logger.info(
f"\n{'non-synonymous' if non_syn_only else ''} variant count by genetic "
f"ancestry group where the\n{'grpmax' if grpmax_counts else ''} AF is above"
f" {threshold}{' and the ' + str(data_subset) + ' AF is below it' if grpmax_counts else ''}...\n"
f"ancestry group where the{' grpmax' if grpmax_counts else ''} AF is above"
f" {threshold}{' and the ' + str(data_subset) + ' AF is below it' if grpmax_counts and eur_filter else ''}...\n"
f"{tabulate(table, headers=headers, tablefmt='fancy_grid')}"
)

Expand All @@ -220,12 +257,20 @@ def main(args):
version_dict = {}
grpmax_counts = args.grpmax_counts
non_syn_only = args.non_syn_only
csq_terms = args.csq_terms
eur_filter = args.eur_filter
csq_terms = csq_terms if csq_terms else non_syn_only

for version in ["v4", "v2"]:
if version == "v4":
ht = release_sites().ht()
elif args.use_v2_liftover:
ht = hl.read_table(get_gnomad_liftover_data_path("exomes"))
vep_ht = vep_context.versions["105"].ht()
ht = ht.annotate(vep=vep_ht[ht.key].vep)
else:
ht = get_gnomad_public_data("exomes")

if args.test:
# Filter to two partitions
ht = ht._filter_partitions([0, 1])
Expand All @@ -235,32 +280,36 @@ def main(args):
"Total number of variants in %s before filtering: %s", version, t_variants
)
msg = ""
# All MANE Select transcripts are canonical
if args.canonical:
logger.info("Filtering to only MANE Select and canonical transcripts...")
ht = filter_vep_to_canonical_transcripts(ht, filter_empty_csq=True)
msg = "canonical transcript filtering"

if non_syn_only:
logger.info("Filtering to keep only non-synonymous variants...")
# NOTE: There is no garauntree the most severe consequence is from the
# canonical transcript if table is filtered to canonical transcripts
ht = ht.filter(
hl.literal(NS_CONSEQ_TERMS).contains(ht.vep.most_severe_consequence)
)
ns_variants = ht.count()

if msg:
msg += " and "
msg += "non-synonymous consequence filtering"
ht = process_consequences(ht, has_polyphen=False)

if csq_terms:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't need an if, right? csq_terms should always be defined based on the logic above

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔎

logger.info(
"Total number of variants in %s %s: %s (%.2f%% of total variants)",
version,
"after " + msg if msg else "",
ns_variants,
(ns_variants / t_variants) * 100,
"Filtering to keep only %s ...",
(
str(csq_terms) + " variants"
if csq_terms and not non_syn_only
else "non-synonymous variants"
),
)
if args.canonical:
vep_csq_expr = ht.vep.worst_csq_for_variant_canonical
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ooh, nice

msg += " on canonical transcripts"
else:
vep_csq_expr = ht.vep.worst_csq_for_variant
msg += " on all transcripts"

ht = ht.filter(
hl.literal(csq_terms).contains(vep_csq_expr.most_severe_consequence)
)
ns_variants = ht.count()

logger.info(
"Total number of variants in %s %s: %s (%.2f%% of total variants)",
version,
"after " + msg if msg else "",
ns_variants,
(ns_variants / t_variants) * 100,
)

ht = ht.checkpoint(
f"gs://gnomad-tmp-4day/grp_comps_{version}_canonical_non_syn.ht",
Expand Down Expand Up @@ -291,18 +340,22 @@ def main(args):
t_variants=t_variants,
version=version,
grpmax_counts=grpmax_counts,
eur_filter=eur_filter,
)

if grpmax_counts:
if grpmax_counts and eur_filter:
for eur_grp in EUR_GRPS.keys():
create_table(
version_dict,
data_subset=eur_grp,
grpmax_counts=grpmax_counts,
non_syn_only=non_syn_only,
eur_filter=eur_filter,
)
else:
create_table(version_dict, non_syn_only=non_syn_only)
create_table(
version_dict, grpmax_counts=grpmax_counts, non_syn_only=non_syn_only
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
version_dict, grpmax_counts=grpmax_counts, non_syn_only=non_syn_only
version_dict, non_syn_only=non_syn_only

also very minor -- just seems a little clearer to not specify either grpmax_counts or eur_filter based on the if/else logic

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I kept this in so you can run it without the eur filter and still get overall grpmax counts if you want, or if you dont care to deduplicate, you can still get the AF>threshold counts

)


if __name__ == "__main__":
Expand Down Expand Up @@ -337,9 +390,30 @@ def main(args):
action="store_true",
help="Filter to two partitions of each HT for testing purposes",
)
parser.add_argument(
"--csq-terms",
nargs="+",
help="Consequence terms to consider",
default=NS_CONSEQ_TERMS,
)
parser.add_argument(
"--eur-filter",
action="store_true",
help="Filter to variants where eur AF is < threshold while grpmax > threshold",
)
parser.add_argument(
"--use-v2-liftover",
action="store_true",
help="Use v2 liftover annotated with vep 105 for comparison",
)

args = parser.parse_args()
if args.drop_hard_filtered and args.no_variant_qc_filters:
raise ValueError(
"Cannot use --drop-hard-filtered-variants and --all-variants together"
)
if args.non_syn_only and (args.csq_terms != NS_CONSEQ_TERMS):
raise ValueError("Cannot use --non-syn-only and --csq-terms together")
if args.eur_filter and not args.grpmax_counts:
raise ValueError("Cannot use --eur-filter without --grpmax-counts")
main(args)
Loading