Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vv ensembl dev susmi #660

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
25 changes: 24 additions & 1 deletion VariantValidator/modules/format_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,11 +637,21 @@ def intronic_converter(variant, validator, skip_check=False):

# Add the edited variant for next stage error processing e.g. exon boundaries.
variant.quibble = transy
# Expanding list of exceptions
try:
hgvs_transy = validator.hp.parse_hgvs_variant(transy)
except vvhgvs.exceptions.HGVSError:
# Allele syntax caught here
if "[" in transy and "]" in transy:
return genomic_ref
else:
raise

if skip_check is True:
return genomic_ref
else:
# Check the specified base is correct
hgvs_genomic = validator.nr_vm.c_to_g(validator.hp.parse_hgvs_variant(transy), genomic_ref,
hgvs_genomic = validator.nr_vm.c_to_g(hgvs_transy, genomic_ref,
alt_aln_method=validator.alt_aln_method)
try:
validator.vr.validate(hgvs_genomic)
Expand All @@ -652,6 +662,19 @@ def intronic_converter(variant, validator, skip_check=False):
else:
validator.vr.validate(hgvs_genomic)

# Check re-mapping of intronic variants
if hgvs_transy.posedit.pos.start.offset != 0 or hgvs_transy.posedit.pos.end.offset != 0:
try:
check_intronic_mapping = validator.nr_vm.g_to_t(hgvs_genomic, hgvs_transy.ac,
alt_aln_method=validator.alt_aln_method)
if check_intronic_mapping.posedit.pos == hgvs_transy.posedit.pos:
pass
else:
variant.warnings.append(f'ExonBoundaryError: {hgvs_transy.posedit.pos} does not match the exon '
f'boundaries for the alignment of {hgvs_transy.ac} to {hgvs_genomic.ac}')
except vvhgvs.exceptions.HGVSError:
pass

logger.debug("HVGS typesetting complete")


Expand Down
20 changes: 16 additions & 4 deletions VariantValidator/modules/gapped_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict):
"""
Gap aware projection from g. to c.
"""

# RefSeq or Ensembl?
expanded_genomic_for_ensembl = False
if self.validator.alt_aln_method == 'genebuild':
Expand Down Expand Up @@ -441,7 +442,7 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict):
self.validator.hp,
self.validator.vm,
self.validator.merge_hgvs_3pr,
genomic_ac=hgvs_genomic_variant.ac,)
genomic_ac=hgvs_genomic_variant.ac)

if vcf__dict['needs_a_push'] is True:
needs_a_push = True
Expand Down Expand Up @@ -687,7 +688,8 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict):
hgvs_not_delins = saved_hgvs_coding
self.disparity_deletion_in = ['false', 'false']
elif 'Normalization of intronic variants is not supported' in error:
# We know that this cannot be because of an intronic variant, so must be aligned to tx gap
# We know that this cannot be because of an intronic variant, so must be aligned to
# tx gap
self.disparity_deletion_in = ['transcript', 'Requires Analysis']
logger.info(error)

Expand All @@ -703,7 +705,6 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict):

# GAP IN THE TRANSCRIPT DISPARITY DETECTED
if self.disparity_deletion_in[0] == 'transcript':

# Check for issue https://github.com/openvar/variantValidator/issues/385 where the gap is
# being identified but oddly the vm is not compensating, likely due to odd sequence
try:
Expand All @@ -723,7 +724,8 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict):
# the actual length should be == gen_len_difference - 1 not == gen_len_difference
if tx_len_difference - self.disparity_deletion_in[1] == gen_len_difference:
# So here we know we need to knock off disparity_deletion_in[1] bases
if len(hgvs_not_delins.posedit.edit.alt) == len(self.tx_hgvs_not_delins.posedit.edit.alt):
if len(hgvs_not_delins.posedit.edit.alt) == len(
self.tx_hgvs_not_delins.posedit.edit.alt):
if self.orientation == 1:
self.tx_hgvs_not_delins.posedit.edit.ref = hgvs_not_delins.posedit.ref
else:
Expand Down Expand Up @@ -2814,6 +2816,7 @@ def transcript_disparity(self, reverse_normalized_hgvs_genomic, stored_hgvs_not_

if self.tx_hgvs_not_delins.posedit.pos.start.offset == 0 and \
self.tx_hgvs_not_delins.posedit.pos.end.offset == 0:

# In this instance, we have identified a transcript gap but the n. version of
# the transcript variant but do not have a position which actually hits the gap,
# so the variant likely spans the gap, and is not picked up by an offset.
Expand All @@ -2832,6 +2835,15 @@ def transcript_disparity(self, reverse_normalized_hgvs_genomic, stored_hgvs_not_
hgvs_refreshed_variant = self.tx_hgvs_not_delins
return hgvs_refreshed_variant

elif (((g3.posedit.pos.end.base - g3.posedit.pos.start.base) >
(hgvs_genomic_norm.posedit.pos.end.base - hgvs_genomic_norm.posedit.pos.start.base)) and
hgvs_genomic_norm.posedit.edit.type == 'del' and
g3.posedit.pos.start.base < hgvs_genomic_norm.posedit.pos.start.base and
(g3.posedit.pos.end.base + int(self.disparity_deletion_in[1])) <
hgvs_genomic_norm.posedit.pos.end.base):
hgvs_refreshed_variant = self.tx_hgvs_not_delins
return hgvs_refreshed_variant

g3.posedit.pos.end.base = g3.posedit.pos.start.base + (len(g3.posedit.edit.ref) - 1)
try:
c2 = self.validator.vm.g_to_t(g3, c1.ac, alt_aln_method=self.validator.alt_aln_method)
Expand Down
39 changes: 30 additions & 9 deletions VariantValidator/modules/gene2transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from . import utils as fn
from . import seq_data


# Pre compile variables
vvhgvs.global_config.formatting.max_ref_length = 1000000

Expand Down Expand Up @@ -53,11 +54,15 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel
query = g2t.db.get_stable_gene_id_from_hgnc_id(query)[1]
if query == "No data":
try:
query = validator.db.get_transcripts_from_annotations(store_query)
for tx in query:
if tx[5] != "unassigned":
query = tx[5]
break
query = g2t.db.get_transcripts_from_annotations(store_query)
if "none" not in query[0]:
for tx in query:
if tx[5] != "unassigned":
query = tx[5]
break
else:
return {'error': 'Unable to recognise HGNC ID. Please provide a gene symbol',
"requested_symbol": store_query}
except TypeError:
pass

Expand Down Expand Up @@ -118,7 +123,6 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel
g2t.hdp.get_tx_identity_info(refresh_hgnc)
tx_found = refresh_hgnc
found_res = True
break
except vvhgvs.exceptions.HGVSError as e:
logger.debug("Except passed, %s", e)
if not found_res:
Expand All @@ -139,8 +143,14 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel
except vvhgvs.exceptions.HGVSError as e:
return {'error': str(e),
"requested_symbol": query}

hgnc = tx_info[6]
hgnc = g2t.db.get_hgnc_symbol(hgnc)
hgnc2 = g2t.db.get_hgnc_symbol(hgnc)

if re.match("LOC", hgnc2) and not re.match("LOC", hgnc):
hgnc = hgnc
else:
hgnc = hgnc2

# First perform a search against the input gene symbol or the symbol inferred from UTA
symbol_identified = False
Expand All @@ -165,10 +175,21 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel
hgnc_id = vvta_record[0][0]
previous_sym = hgnc
symbol_identified = True
if len(vvta_record) > 1:
elif len(vvta_record) > 1:
return {'error': '%s is a previous symbol for %s genes. '
'Refer to https://www.genenames.org/' % (current_sym, str(len(vvta_record))),
'Refer to https://www.genenames.org/' % (hgnc, str(len(vvta_record))),
"requested_symbol": query}
else:
# Is it an updated symbol?
old_symbol = g2t.db.get_uta_symbol(hgnc)
if old_symbol is not None:
vvta_record = g2t.hdp.get_gene_info(old_symbol)
if vvta_record is not None:
current_sym = hgnc
gene_name = vvta_record[3]
hgnc_id = vvta_record[0]
previous_sym = old_symbol
symbol_identified = True

if symbol_identified is False:
return {'error': 'Unable to recognise gene symbol %s' % hgnc,
Expand Down
46 changes: 36 additions & 10 deletions VariantValidator/modules/hgvs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,6 @@ def hard_right_hgvs2vcf(hgvs_genomic, primary_assembly, hn, reverse_normalizer,
:param genomic_ac:
:return:
"""

# c. must be in n. format
try:
hgvs_genomic = vm.c_to_n(hgvs_genomic) # Need in n. context
Expand Down Expand Up @@ -1057,6 +1056,27 @@ def hard_right_hgvs2vcf(hgvs_genomic, primary_assembly, hn, reverse_normalizer,
else:
end_seq_check_mapped = vm.g_to_n(end_seq_check_variant, tx_ac)

# Look for flank subs that may be missed when naieve mapping c > c made a delins from a sub
# This is a hgvs.py quirl for flanking subs in the antisense oriemntation and refers to
# https://github.com/openvar/variantValidator/issues/651
try:
normalized_end_seq_check_mapped = hn.normalize(end_seq_check_mapped)
normalized_end_seq_check_variant = hn.normalize(end_seq_check_variant)
if (normalized_end_seq_check_mapped.type == 'g' and
normalized_end_seq_check_variant.type == 'n' and
normalized_end_seq_check_mapped.posedit.edit.type == 'sub' and
normalized_end_seq_check_variant == normalized_hgvs_genomic):

# double check the original mapping
sub_map = vm.g_to_t(normalized_end_seq_check_mapped, normalized_end_seq_check_variant.ac)
if sub_map.posedit.edit.type == 'sub':
needs_a_push = True
merged_variant = sub_map
identifying_g_variant = end_seq_check_mapped
break
except vvhgvs.exceptions.HGVSUnsupportedOperationError:
pass

# For genomic_variant mapped onto gaps, we end up with an offset
start_offset = False
end_offset = False
Expand Down Expand Up @@ -2257,15 +2277,21 @@ def hgvs_ref_alt(hgvs_variant, sf):
ref_alt_dict = {'ref': ref, 'alt': alt}
return ref_alt_dict

#
# def hgvs_to_delins(hgvs_variant_object):
# """
# Refer to https://github.com/openvar/vv_hgvs/blob/master/examples/creating-a-variant.ipynb
# :param hgvs_variant_object:
# :return: hgvs_variant_object in raw type = "delins" state
# """
#
#

def incomplete_alignment_mapping_t_to_g(validator, variant):
output = None
mapping_options = validator.hdp.get_tx_mapping_options(variant.input_parses.ac)
for option in mapping_options:
if option[2] == validator.alt_aln_method and "NC_" not in option[1]:
in_assembly = seq_data.to_chr_num_refseq(option[1], variant.primary_assembly)
if in_assembly is not None:
try:
output = validator.vm.t_to_g(variant.input_parses, option[1])
if variant.input_parses.posedit.edit.type == "identity":
output.posedit.edit.alt = output.posedit.edit.ref
except vvhgvs.exceptions.HGVSError:
pass
return output

# <LICENSE>
# Copyright (C) 2016-2024 VariantValidator Contributors
Expand Down
61 changes: 50 additions & 11 deletions VariantValidator/modules/mappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,13 +240,12 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version
# Se rec_var to '' so it can be updated later
rec_var = ''

# First task is to get the genomic equivalent, and pri nt useful error messages if it can't be found.
# First task is to get the genomic equivalent, and print useful error messages if it can't be found.
try:
to_g = validator.myevm_t_to_g(obj, variant.no_norm_evm, variant.primary_assembly, variant.hn)
genomic_ac = to_g.ac
except vvhgvs.exceptions.HGVSDataNotAvailableError as e:
errors = []

if ('~' in str(e) and 'Alignment is incomplete' in str(e)) or "No relevant genomic mapping options" in str(e):
# Unable to map the input variant onto a genomic position
if '~' in str(e) and 'Alignment is incomplete' in str(e):
Expand Down Expand Up @@ -312,7 +311,19 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version
else:
# Normalize was I believe to replace ref. Mapping does this anyway
# to_g = variant.hn.normalize(to_g)
formatted_variant = str(validator.myevm_g_to_t(variant.evm, to_g, tx_ac))
try:
formatted_variant = str(validator.myevm_g_to_t(variant.evm, to_g, tx_ac))
except vvhgvs.exceptions.HGVSError as e:
if "Alignment is incomplete" in str(e):
to_g = hgvs_utils.incomplete_alignment_mapping_t_to_g(validator, variant)
if to_g is None:
error = str(e)
variant.warnings.append(error)
logger.warning(error)
return True
else:
variant.hgvc_genomic = to_g
formatted_variant = str(validator.vm.g_to_t(to_g, tx_ac))

elif ':g.' in quibble_input:
if plus.search(formatted_variant) or minus.search(formatted_variant):
Expand Down Expand Up @@ -388,7 +399,6 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version
# hgvs will handle incorrect coordinates so need to automap errors
# Make sure any input intronic coordinates are correct
# Get the desired transcript

if cck:
# This should only ever hit coding variants (RNA has been converted to c by now)
if 'del' in formatted_variant:
Expand All @@ -414,9 +424,20 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version
try:
post_var = validator.myevm_g_to_t(variant.evm, pre_var, trans_acc)
except vvhgvs.exceptions.HGVSError as error:
variant.warnings.append(str(error))
logger.warning(str(error))
return True
if "Alignment is incomplete" in str(error):
output = hgvs_utils.incomplete_alignment_mapping_t_to_g(validator, variant)
if output is None:
error = str(error)
variant.warnings.append(error)
logger.warning(error)
return True
else:
post_var = validator.vm.g_to_t(output, tx_ac)
variant.hgvs_genomic = output
else:
variant.warnings.append(str(error))
logger.warning(str(error))
return True

test = validator.hp.parse_hgvs_variant(quibble_input)
if post_var.posedit.pos.start.base != test.posedit.pos.start.base or \
Expand Down Expand Up @@ -467,7 +488,20 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version
variant.hn)

# genome back to C coordinates
post_var = validator.myevm_g_to_t(variant.evm, pre_var, trans_acc)
try:
post_var = validator.myevm_g_to_t(variant.evm, pre_var, trans_acc)
except vvhgvs.exceptions.HGVSError as e:
if "Alignment is incomplete" in str(e):
pre_var = hgvs_utils.incomplete_alignment_mapping_t_to_g(validator, variant)
if to_g is None:
error = str(e)
variant.warnings.append(error)
logger.warning(error)
return True
else:
post_var = validator.vm.g_to_t(pre_var, tx_ac)
variant.hgvs_genomic = pre_var

test = validator.hp.parse_hgvs_variant(quibble_input)
if post_var.posedit.pos.start.base != test.posedit.pos.start.base or \
post_var.posedit.pos.end.base != test.posedit.pos.end.base:
Expand Down Expand Up @@ -590,7 +624,12 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version
logger.debug("gap_compensation_1 = " + str(gap_compensation))

# Genomic sequence
hgvs_genomic = validator.myevm_t_to_g(hgvs_coding, variant.no_norm_evm, variant.primary_assembly, variant.hn)
if variant.hgvs_genomic is not None:
hgvs_genomic = variant.hgvs_genomic
else:
hgvs_genomic = validator.myevm_t_to_g(hgvs_coding, variant.no_norm_evm, variant.primary_assembly, variant.hn)



# Create gap_mapper object instance
gap_mapper = gapped_mapping.GapMapper(variant, validator)
Expand Down Expand Up @@ -875,12 +914,12 @@ def final_tx_to_multiple_genomic(variant, validator, tx_variant, liftover_level=
ori = validator.tx_exons(tx_ac=variant.hgvs_coding.ac, alt_ac=alt_chr,
alt_aln_method=validator.alt_aln_method)

# Map the variant to the genome
hgvs_alt_genomic = validator.myvm_t_to_g(variant.hgvs_coding, alt_chr, variant.no_norm_evm, variant.hn)

gap_mapper = gapped_mapping.GapMapper(variant, validator)

# Loop out gap code under these circumstances!
if gap_compensation:
gap_mapper = gapped_mapping.GapMapper(variant, validator)
hgvs_alt_genomic, hgvs_coding = gap_mapper.g_to_t_gap_compensation_version3(
hgvs_alt_genomic, variant.hgvs_coding, ori, alt_chr, rec_var)
variant.hgvs_coding = hgvs_coding
Expand Down
Loading