From 4e2e82ee51655307cbd1b0a15ac79f126fbf95cb Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Mon, 2 Dec 2024 15:21:50 +0000 Subject: [PATCH 1/7] Fixes that overcome NR transcripts with LOC based gene symbols in genes2transcripts functionality --- VariantValidator/modules/gene2transcripts.py | 9 +++++++-- tests/test_inputs.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/VariantValidator/modules/gene2transcripts.py b/VariantValidator/modules/gene2transcripts.py index 7dcd7523..6ab426ea 100644 --- a/VariantValidator/modules/gene2transcripts.py +++ b/VariantValidator/modules/gene2transcripts.py @@ -118,7 +118,6 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel g2t.hdp.get_tx_identity_info(refresh_hgnc) tx_found = refresh_hgnc found_res = True - break except vvhgvs.exceptions.HGVSError as e: logger.debug("Except passed, %s", e) if not found_res: @@ -139,8 +138,14 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel except vvhgvs.exceptions.HGVSError as e: return {'error': str(e), "requested_symbol": query} + hgnc = tx_info[6] - hgnc = g2t.db.get_hgnc_symbol(hgnc) + hgnc2 = g2t.db.get_hgnc_symbol(hgnc) + + if re.match("LOC", hgnc2) and not re.match("LOC", hgnc): + hgnc = hgnc + else: + hgnc = hgnc2 # First perform a search against the input gene symbol or the symbol inferred from UTA symbol_identified = False diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 658b9245..cd17e78f 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -31032,6 +31032,17 @@ def test_issue_645d(self): results['NM_030582.3:c.3364_3365del'][ 'primary_assembly_loci']['grch37']['hgvs_genomic_description'] + def test_issue_651a(self): + variant = 'NM_017680.6:c.153G>T' + select_transcripts = 'all' + results = self.vv.validate(variant, 'GRCh38', select_transcripts).format_as_dict(test=True) + assert 'NC_000009.12:g.92474742delinsATCA' in \ + results['NM_017680.6:c.153G>T'][ + 'primary_assembly_loci']['grch38']['hgvs_genomic_description'] + assert 'NC_000009.11:g.95237024delinsATCA' in \ + results['NM_017680.6:c.153G>T'][ + 'primary_assembly_loci']['grch37']['hgvs_genomic_description'] + # # Copyright (C) 2016-2024 VariantValidator Contributors # From dbfb7b17fc89cc12c2d73363a50e748c736e2c71 Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Tue, 3 Dec 2024 12:43:57 +0000 Subject: [PATCH 2/7] Fixes the mapping of NC_000009.12:g.92474742delinsATCA back to NM_017680.6:c.153G>T in issue https://github.com/openvar/variantValidator/issues/651 --- VariantValidator/modules/gapped_mapping.py | 1 + VariantValidator/modules/hgvs_utils.py | 22 +++++++++++++++++++++- tests/test_inputs.py | 11 +++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/VariantValidator/modules/gapped_mapping.py b/VariantValidator/modules/gapped_mapping.py index 54c815ef..bc969cb5 100644 --- a/VariantValidator/modules/gapped_mapping.py +++ b/VariantValidator/modules/gapped_mapping.py @@ -157,6 +157,7 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict): """ Gap aware projection from g. to c. """ + # RefSeq or Ensembl? expanded_genomic_for_ensembl = False if self.validator.alt_aln_method == 'genebuild': diff --git a/VariantValidator/modules/hgvs_utils.py b/VariantValidator/modules/hgvs_utils.py index fc1e109f..09376419 100644 --- a/VariantValidator/modules/hgvs_utils.py +++ b/VariantValidator/modules/hgvs_utils.py @@ -716,7 +716,6 @@ def hard_right_hgvs2vcf(hgvs_genomic, primary_assembly, hn, reverse_normalizer, :param genomic_ac: :return: """ - # c. must be in n. format try: hgvs_genomic = vm.c_to_n(hgvs_genomic) # Need in n. context @@ -1057,6 +1056,27 @@ def hard_right_hgvs2vcf(hgvs_genomic, primary_assembly, hn, reverse_normalizer, else: end_seq_check_mapped = vm.g_to_n(end_seq_check_variant, tx_ac) + # Look for flank subs that may be missed when naieve mapping c > c made a delins from a sub + # This is a hgvs.py quirl for flanking subs in the antisense oriemntation and refers to + # + try: + normalized_end_seq_check_mapped = hn.normalize(end_seq_check_mapped) + normalized_end_seq_check_variant = hn.normalize(end_seq_check_variant) + if (normalized_end_seq_check_mapped.type == 'g' and + normalized_end_seq_check_variant.type == 'n' and + normalized_end_seq_check_mapped.posedit.edit.type == 'sub' and + normalized_end_seq_check_variant == normalized_hgvs_genomic): + + # double check the original mapping + sub_map = vm.g_to_t(normalized_end_seq_check_mapped, normalized_end_seq_check_variant.ac) + if sub_map.posedit.edit.type == 'sub': + needs_a_push = True + merged_variant = sub_map + identifying_g_variant = end_seq_check_mapped + break + except vvhgvs.exceptions.HGVSUnsupportedOperationError: + pass + # For genomic_variant mapped onto gaps, we end up with an offset start_offset = False end_offset = False diff --git a/tests/test_inputs.py b/tests/test_inputs.py index cd17e78f..1558486a 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -31043,6 +31043,17 @@ def test_issue_651a(self): results['NM_017680.6:c.153G>T'][ 'primary_assembly_loci']['grch37']['hgvs_genomic_description'] + def test_issue_651b(self): + variant = 'NC_000009.12:g.92474742delinsATCA' + select_transcripts = 'NM_017680.6' + results = self.vv.validate(variant, 'GRCh38', select_transcripts).format_as_dict(test=True) + assert 'NC_000009.12:g.92474742delinsATCA' in \ + results['NM_017680.6:c.153G>T'][ + 'primary_assembly_loci']['grch38']['hgvs_genomic_description'] + assert 'NC_000009.11:g.95237024delinsATCA' in \ + results['NM_017680.6:c.153G>T'][ + 'primary_assembly_loci']['grch37']['hgvs_genomic_description'] + # # Copyright (C) 2016-2024 VariantValidator Contributors # From 033d9483f866f7bdd6cf004af228e1f3430939cf Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Wed, 4 Dec 2024 12:32:20 +0000 Subject: [PATCH 3/7] Add tweaks to genes2transcripts to handle gene symbols that are updating and HGNC genes with no transcript info https://github.com/openvar/rest_variantValidator/issues/186 and also handle the longer deletions in https://github.com/openvar/variantValidator/issues/651 --- VariantValidator/modules/gapped_mapping.py | 19 ++++++++++--- VariantValidator/modules/gene2transcripts.py | 30 +++++++++++++++----- VariantValidator/modules/hgvs_utils.py | 2 +- tests/test_gene2transcript.py | 7 +++++ 4 files changed, 46 insertions(+), 12 deletions(-) diff --git a/VariantValidator/modules/gapped_mapping.py b/VariantValidator/modules/gapped_mapping.py index bc969cb5..4b8863af 100644 --- a/VariantValidator/modules/gapped_mapping.py +++ b/VariantValidator/modules/gapped_mapping.py @@ -442,7 +442,7 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict): self.validator.hp, self.validator.vm, self.validator.merge_hgvs_3pr, - genomic_ac=hgvs_genomic_variant.ac,) + genomic_ac=hgvs_genomic_variant.ac) if vcf__dict['needs_a_push'] is True: needs_a_push = True @@ -688,7 +688,8 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict): hgvs_not_delins = saved_hgvs_coding self.disparity_deletion_in = ['false', 'false'] elif 'Normalization of intronic variants is not supported' in error: - # We know that this cannot be because of an intronic variant, so must be aligned to tx gap + # We know that this cannot be because of an intronic variant, so must be aligned to + # tx gap self.disparity_deletion_in = ['transcript', 'Requires Analysis'] logger.info(error) @@ -704,7 +705,6 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict): # GAP IN THE TRANSCRIPT DISPARITY DETECTED if self.disparity_deletion_in[0] == 'transcript': - # Check for issue https://github.com/openvar/variantValidator/issues/385 where the gap is # being identified but oddly the vm is not compensating, likely due to odd sequence try: @@ -724,7 +724,8 @@ def gapped_g_to_c(self, rel_var, select_transcripts_dict): # the actual length should be == gen_len_difference - 1 not == gen_len_difference if tx_len_difference - self.disparity_deletion_in[1] == gen_len_difference: # So here we know we need to knock off disparity_deletion_in[1] bases - if len(hgvs_not_delins.posedit.edit.alt) == len(self.tx_hgvs_not_delins.posedit.edit.alt): + if len(hgvs_not_delins.posedit.edit.alt) == len( + self.tx_hgvs_not_delins.posedit.edit.alt): if self.orientation == 1: self.tx_hgvs_not_delins.posedit.edit.ref = hgvs_not_delins.posedit.ref else: @@ -2815,6 +2816,7 @@ def transcript_disparity(self, reverse_normalized_hgvs_genomic, stored_hgvs_not_ if self.tx_hgvs_not_delins.posedit.pos.start.offset == 0 and \ self.tx_hgvs_not_delins.posedit.pos.end.offset == 0: + # In this instance, we have identified a transcript gap but the n. version of # the transcript variant but do not have a position which actually hits the gap, # so the variant likely spans the gap, and is not picked up by an offset. @@ -2833,6 +2835,15 @@ def transcript_disparity(self, reverse_normalized_hgvs_genomic, stored_hgvs_not_ hgvs_refreshed_variant = self.tx_hgvs_not_delins return hgvs_refreshed_variant + elif (((g3.posedit.pos.end.base - g3.posedit.pos.start.base) > + (hgvs_genomic_norm.posedit.pos.end.base - hgvs_genomic_norm.posedit.pos.start.base)) and + hgvs_genomic_norm.posedit.edit.type == 'del' and + g3.posedit.pos.start.base < hgvs_genomic_norm.posedit.pos.start.base and + (g3.posedit.pos.end.base + int(self.disparity_deletion_in[1])) < + hgvs_genomic_norm.posedit.pos.end.base): + hgvs_refreshed_variant = self.tx_hgvs_not_delins + return hgvs_refreshed_variant + g3.posedit.pos.end.base = g3.posedit.pos.start.base + (len(g3.posedit.edit.ref) - 1) try: c2 = self.validator.vm.g_to_t(g3, c1.ac, alt_aln_method=self.validator.alt_aln_method) diff --git a/VariantValidator/modules/gene2transcripts.py b/VariantValidator/modules/gene2transcripts.py index 6ab426ea..2d9d4057 100644 --- a/VariantValidator/modules/gene2transcripts.py +++ b/VariantValidator/modules/gene2transcripts.py @@ -6,6 +6,7 @@ from . import utils as fn from . import seq_data + # Pre compile variables vvhgvs.global_config.formatting.max_ref_length = 1000000 @@ -53,11 +54,15 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel query = g2t.db.get_stable_gene_id_from_hgnc_id(query)[1] if query == "No data": try: - query = validator.db.get_transcripts_from_annotations(store_query) - for tx in query: - if tx[5] != "unassigned": - query = tx[5] - break + query = g2t.db.get_transcripts_from_annotations(store_query) + if "none" not in query[0]: + for tx in query: + if tx[5] != "unassigned": + query = tx[5] + break + else: + return {'error': 'Unable to recognise HGNC ID. Please provide a gene symbol', + "requested_symbol": store_query} except TypeError: pass @@ -170,10 +175,21 @@ def gene2transcripts(g2t, query, validator=False, bypass_web_searches=False, sel hgnc_id = vvta_record[0][0] previous_sym = hgnc symbol_identified = True - if len(vvta_record) > 1: + elif len(vvta_record) > 1: return {'error': '%s is a previous symbol for %s genes. ' - 'Refer to https://www.genenames.org/' % (current_sym, str(len(vvta_record))), + 'Refer to https://www.genenames.org/' % (hgnc, str(len(vvta_record))), "requested_symbol": query} + else: + # Is it an updated symbol? + old_symbol = g2t.db.get_uta_symbol(hgnc) + if old_symbol is not None: + vvta_record = g2t.hdp.get_gene_info(old_symbol) + if vvta_record is not None: + current_sym = hgnc + gene_name = vvta_record[3] + hgnc_id = vvta_record[0] + previous_sym = old_symbol + symbol_identified = True if symbol_identified is False: return {'error': 'Unable to recognise gene symbol %s' % hgnc, diff --git a/VariantValidator/modules/hgvs_utils.py b/VariantValidator/modules/hgvs_utils.py index 09376419..f43b732d 100644 --- a/VariantValidator/modules/hgvs_utils.py +++ b/VariantValidator/modules/hgvs_utils.py @@ -1058,7 +1058,7 @@ def hard_right_hgvs2vcf(hgvs_genomic, primary_assembly, hn, reverse_normalizer, # Look for flank subs that may be missed when naieve mapping c > c made a delins from a sub # This is a hgvs.py quirl for flanking subs in the antisense oriemntation and refers to - # + # https://github.com/openvar/variantValidator/issues/651 try: normalized_end_seq_check_mapped = hn.normalize(end_seq_check_mapped) normalized_end_seq_check_variant = hn.normalize(end_seq_check_variant) diff --git a/tests/test_gene2transcript.py b/tests/test_gene2transcript.py index 8935fb7c..5df853c7 100644 --- a/tests/test_gene2transcript.py +++ b/tests/test_gene2transcript.py @@ -185,6 +185,13 @@ def test_symbol_valid_hgnc_id(self): results = self.vv.gene2transcripts(symbol) assert results["current_symbol"] == "COL1A1" + def test_symbol_invalid_hgnc_id(self): + symbol = 'HGNC:12029' + results = self.vv.gene2transcripts(symbol) + print(results) + assert results["error"] == "Unable to recognise HGNC ID. Please provide a gene symbol" + assert results["requested_symbol"] == symbol + def test_multiple_genes(self): symbol = '["HGNC:2197", "COL1A1", "2197"]' results = self.vv.gene2transcripts(symbol) From 45f5d07973f49705af69fb9e7b64ae90a8fd913c Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Thu, 5 Dec 2024 13:36:06 +0000 Subject: [PATCH 4/7] Update the code to accept intronic variants in transcripts with alternate alignments in patches vs the primary assembly. Issue https://github.com/openvar/variantValidator/issues/657 --- VariantValidator/modules/format_converters.py | 25 +++++- VariantValidator/modules/hgvs_utils.py | 24 +++-- VariantValidator/modules/mappers.py | 61 ++++++++++--- VariantValidator/modules/use_checking.py | 22 +++-- VariantValidator/modules/vvMixinCore.py | 1 - tests/test_allele_syntax.py | 16 ++++ tests/test_alternate_alignments.py | 90 +++++++++++++++++++ 7 files changed, 211 insertions(+), 28 deletions(-) create mode 100644 tests/test_alternate_alignments.py diff --git a/VariantValidator/modules/format_converters.py b/VariantValidator/modules/format_converters.py index c9ff1e8d..b7daad5f 100644 --- a/VariantValidator/modules/format_converters.py +++ b/VariantValidator/modules/format_converters.py @@ -637,11 +637,21 @@ def intronic_converter(variant, validator, skip_check=False): # Add the edited variant for next stage error processing e.g. exon boundaries. variant.quibble = transy + # Expanding list of exceptions + try: + hgvs_transy = validator.hp.parse_hgvs_variant(transy) + except vvhgvs.exceptions.HGVSError: + # Allele syntax caught here + if "[" in transy and "]" in transy: + return genomic_ref + else: + raise + if skip_check is True: return genomic_ref else: # Check the specified base is correct - hgvs_genomic = validator.nr_vm.c_to_g(validator.hp.parse_hgvs_variant(transy), genomic_ref, + hgvs_genomic = validator.nr_vm.c_to_g(hgvs_transy, genomic_ref, alt_aln_method=validator.alt_aln_method) try: validator.vr.validate(hgvs_genomic) @@ -652,6 +662,19 @@ def intronic_converter(variant, validator, skip_check=False): else: validator.vr.validate(hgvs_genomic) + # Check re-mapping of intronic variants + if hgvs_transy.posedit.pos.start.offset != 0 or hgvs_transy.posedit.pos.end.offset != 0: + try: + check_intronic_mapping = validator.nr_vm.g_to_t(hgvs_genomic, hgvs_transy.ac, + alt_aln_method=validator.alt_aln_method) + if check_intronic_mapping.posedit.pos == hgvs_transy.posedit.pos: + pass + else: + variant.warnings.append(f'ExonBoundaryError: {hgvs_transy.posedit.pos} does not match the exon ' + f'boundaries for the alignment of {hgvs_transy.ac} to {hgvs_genomic.ac}') + except vvhgvs.exceptions.HGVSError: + pass + logger.debug("HVGS typesetting complete") diff --git a/VariantValidator/modules/hgvs_utils.py b/VariantValidator/modules/hgvs_utils.py index f43b732d..62618690 100644 --- a/VariantValidator/modules/hgvs_utils.py +++ b/VariantValidator/modules/hgvs_utils.py @@ -2277,15 +2277,21 @@ def hgvs_ref_alt(hgvs_variant, sf): ref_alt_dict = {'ref': ref, 'alt': alt} return ref_alt_dict -# -# def hgvs_to_delins(hgvs_variant_object): -# """ -# Refer to https://github.com/openvar/vv_hgvs/blob/master/examples/creating-a-variant.ipynb -# :param hgvs_variant_object: -# :return: hgvs_variant_object in raw type = "delins" state -# """ -# -# + +def incomplete_alignment_mapping_t_to_g(validator, variant): + output = None + mapping_options = validator.hdp.get_tx_mapping_options(variant.input_parses.ac) + for option in mapping_options: + if option[2] == validator.alt_aln_method and "NC_" not in option[1]: + in_assembly = seq_data.to_chr_num_refseq(option[1], variant.primary_assembly) + if in_assembly is not None: + try: + output = validator.vm.t_to_g(variant.input_parses, option[1]) + if variant.input_parses.posedit.edit.type == "identity": + output.posedit.edit.alt = output.posedit.edit.ref + except vvhgvs.exceptions.HGVSError: + pass + return output # # Copyright (C) 2016-2024 VariantValidator Contributors diff --git a/VariantValidator/modules/mappers.py b/VariantValidator/modules/mappers.py index 4b6bcf13..fe805c22 100644 --- a/VariantValidator/modules/mappers.py +++ b/VariantValidator/modules/mappers.py @@ -240,13 +240,12 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version # Se rec_var to '' so it can be updated later rec_var = '' - # First task is to get the genomic equivalent, and pri nt useful error messages if it can't be found. + # First task is to get the genomic equivalent, and print useful error messages if it can't be found. try: to_g = validator.myevm_t_to_g(obj, variant.no_norm_evm, variant.primary_assembly, variant.hn) genomic_ac = to_g.ac except vvhgvs.exceptions.HGVSDataNotAvailableError as e: errors = [] - if ('~' in str(e) and 'Alignment is incomplete' in str(e)) or "No relevant genomic mapping options" in str(e): # Unable to map the input variant onto a genomic position if '~' in str(e) and 'Alignment is incomplete' in str(e): @@ -312,7 +311,19 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version else: # Normalize was I believe to replace ref. Mapping does this anyway # to_g = variant.hn.normalize(to_g) - formatted_variant = str(validator.myevm_g_to_t(variant.evm, to_g, tx_ac)) + try: + formatted_variant = str(validator.myevm_g_to_t(variant.evm, to_g, tx_ac)) + except vvhgvs.exceptions.HGVSError as e: + if "Alignment is incomplete" in str(e): + to_g = hgvs_utils.incomplete_alignment_mapping_t_to_g(validator, variant) + if to_g is None: + error = str(e) + variant.warnings.append(error) + logger.warning(error) + return True + else: + variant.hgvc_genomic = to_g + formatted_variant = str(validator.vm.g_to_t(to_g, tx_ac)) elif ':g.' in quibble_input: if plus.search(formatted_variant) or minus.search(formatted_variant): @@ -388,7 +399,6 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version # hgvs will handle incorrect coordinates so need to automap errors # Make sure any input intronic coordinates are correct # Get the desired transcript - if cck: # This should only ever hit coding variants (RNA has been converted to c by now) if 'del' in formatted_variant: @@ -414,9 +424,20 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version try: post_var = validator.myevm_g_to_t(variant.evm, pre_var, trans_acc) except vvhgvs.exceptions.HGVSError as error: - variant.warnings.append(str(error)) - logger.warning(str(error)) - return True + if "Alignment is incomplete" in str(error): + output = hgvs_utils.incomplete_alignment_mapping_t_to_g(validator, variant) + if output is None: + error = str(error) + variant.warnings.append(error) + logger.warning(error) + return True + else: + post_var = validator.vm.g_to_t(output, tx_ac) + variant.hgvs_genomic = output + else: + variant.warnings.append(str(error)) + logger.warning(str(error)) + return True test = validator.hp.parse_hgvs_variant(quibble_input) if post_var.posedit.pos.start.base != test.posedit.pos.start.base or \ @@ -467,7 +488,20 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version variant.hn) # genome back to C coordinates - post_var = validator.myevm_g_to_t(variant.evm, pre_var, trans_acc) + try: + post_var = validator.myevm_g_to_t(variant.evm, pre_var, trans_acc) + except vvhgvs.exceptions.HGVSError as e: + if "Alignment is incomplete" in str(e): + pre_var = hgvs_utils.incomplete_alignment_mapping_t_to_g(validator, variant) + if to_g is None: + error = str(e) + variant.warnings.append(error) + logger.warning(error) + return True + else: + post_var = validator.vm.g_to_t(pre_var, tx_ac) + variant.hgvs_genomic = pre_var + test = validator.hp.parse_hgvs_variant(quibble_input) if post_var.posedit.pos.start.base != test.posedit.pos.start.base or \ post_var.posedit.pos.end.base != test.posedit.pos.end.base: @@ -590,7 +624,12 @@ def transcripts_to_gene(variant, validator, select_transcripts_dict_plus_version logger.debug("gap_compensation_1 = " + str(gap_compensation)) # Genomic sequence - hgvs_genomic = validator.myevm_t_to_g(hgvs_coding, variant.no_norm_evm, variant.primary_assembly, variant.hn) + if variant.hgvs_genomic is not None: + hgvs_genomic = variant.hgvs_genomic + else: + hgvs_genomic = validator.myevm_t_to_g(hgvs_coding, variant.no_norm_evm, variant.primary_assembly, variant.hn) + + # Create gap_mapper object instance gap_mapper = gapped_mapping.GapMapper(variant, validator) @@ -875,12 +914,12 @@ def final_tx_to_multiple_genomic(variant, validator, tx_variant, liftover_level= ori = validator.tx_exons(tx_ac=variant.hgvs_coding.ac, alt_ac=alt_chr, alt_aln_method=validator.alt_aln_method) + # Map the variant to the genome hgvs_alt_genomic = validator.myvm_t_to_g(variant.hgvs_coding, alt_chr, variant.no_norm_evm, variant.hn) - gap_mapper = gapped_mapping.GapMapper(variant, validator) - # Loop out gap code under these circumstances! if gap_compensation: + gap_mapper = gapped_mapping.GapMapper(variant, validator) hgvs_alt_genomic, hgvs_coding = gap_mapper.g_to_t_gap_compensation_version3( hgvs_alt_genomic, variant.hgvs_coding, ori, alt_chr, rec_var) variant.hgvs_coding = hgvs_coding diff --git a/VariantValidator/modules/use_checking.py b/VariantValidator/modules/use_checking.py index d4127ee1..d1cda229 100644 --- a/VariantValidator/modules/use_checking.py +++ b/VariantValidator/modules/use_checking.py @@ -5,6 +5,7 @@ import logging from . import utils as fn import copy +from . import hgvs_utils logger = logging.getLogger(__name__) @@ -25,6 +26,7 @@ def refseq_common_mistakes(variant): variant.warnings.append(error) logger.warning(error) return True + # NR_ c. if variant.transcript_type == "n" and variant.reftype == ':c.': suggestion = variant.quibble.replace(':c.', ':n.') @@ -33,6 +35,7 @@ def refseq_common_mistakes(variant): variant.warnings.append(error) logger.warning(error) return True + # NM_ n. if variant.transcript_type == "c" and variant.reftype == ':n.': suggestion = variant.quibble.replace(':n.', ':c.') @@ -74,7 +77,6 @@ def structure_checks(variant, validator): variant.input_parses = input_parses variant.gene_symbol = validator.db.get_gene_symbol_from_transcript_id(variant.input_parses.ac) - if variant.gene_symbol == 'none': variant.gene_symbol = '' if input_parses.type == 'g' or input_parses.type == 'm': @@ -188,7 +190,6 @@ def structure_checks_c(variant, validator): validator.vr.validate(variant.input_parses) except vvhgvs.exceptions.HGVSError as e: error = str(e) - if 'datums is ill-defined' in error: called_ref = variant.input_parses.posedit.edit.ref @@ -470,13 +471,22 @@ def structure_checks_c(variant, validator): variant.warnings.append(error) logger.warning(error) return True + try: variant.evm.g_to_t(output, variant.input_parses.ac) except vvhgvs.exceptions.HGVSError as e: - error = str(e) - variant.warnings.append(error) - logger.warning(error) - return True + if "Alignment is incomplete" in str(e): + output = hgvs_utils.incomplete_alignment_mapping_t_to_g(validator, variant) + if output is None: + error = str(e) + variant.warnings.append(error) + logger.warning(error) + return True + else: + error = str(e) + variant.warnings.append(error) + logger.warning(error) + return True # Check that the reference is correct by direct mapping without replacing reference check_ref_g = variant.no_replace_vm.t_to_g(variant.input_parses, output.ac, diff --git a/VariantValidator/modules/vvMixinCore.py b/VariantValidator/modules/vvMixinCore.py index bcd66ebb..426e39bd 100644 --- a/VariantValidator/modules/vvMixinCore.py +++ b/VariantValidator/modules/vvMixinCore.py @@ -1636,7 +1636,6 @@ def validate(self, for vt in variant.warnings: vt = str(vt) - # Do not warn transcript not part of build if it's not the relevant transcript if "is not part of genome build" in vt and term not in vt: continue diff --git a/tests/test_allele_syntax.py b/tests/test_allele_syntax.py index efa4bb79..81e8842e 100644 --- a/tests/test_allele_syntax.py +++ b/tests/test_allele_syntax.py @@ -93,3 +93,19 @@ def test_variant11(self): "so should be described as NM_000059.4:c.1917_1929delinsTGCATTCTTCTGT" in results["validation_warning_1"]["validation_warnings"]) +# +# Copyright (C) 2016-2024 VariantValidator Contributors +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# \ No newline at end of file diff --git a/tests/test_alternate_alignments.py b/tests/test_alternate_alignments.py new file mode 100644 index 00000000..2b71e5b8 --- /dev/null +++ b/tests/test_alternate_alignments.py @@ -0,0 +1,90 @@ +from VariantValidator import Validator +from unittest import TestCase + + +class TestAltAlignments(TestCase): + + @classmethod + def setup_class(cls): + cls.vv = Validator() + cls.vv.testing = True + + def test_sub(self): + variant = 'NW_011332691.1(NM_012234.6):c.335+1G>C' + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "NM_012234.6:c.335+1G>C" in list(results.keys()) + assert results["NM_012234.6:c.335+1G>C"][ + "genome_context_intronic_sequence"] == "NW_011332691.1(NM_012234.6):c.335+1G>C" + assert results["NM_012234.6:c.335+1G>C"]["alt_genomic_loci"][0]["grch38"][ + "hgvs_genomic_description"] == "NW_011332691.1:g.53828C>G" + + def test_del(self): + variant = 'NW_011332691.1(NM_012234.6):c.335+1del' + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "NM_012234.6:c.335+1del" in list(results.keys()) + assert "NW_011332691.1(NM_012234.6):c.335+1del" in results[ + "NM_012234.6:c.335+1del"]["genome_context_intronic_sequence"] + assert "NW_011332691.1:g.53828del" in results["NM_012234.6:c.335+1del"]["alt_genomic_loci"][0]["grch38"][ + "hgvs_genomic_description"] + + def test_delins(self): + variant = "NW_011332691.1(NM_012234.6):c.335+1delinsAT" + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "NM_012234.6:c.335+1delinsAT" in list(results.keys()) + assert "NW_011332691.1(NM_012234.6):c.335+1delinsAT" in results[ + "NM_012234.6:c.335+1delinsAT"]["genome_context_intronic_sequence"] + assert "NW_011332691.1:g.53828delinsAT" in results["NM_012234.6:c.335+1delinsAT"]["alt_genomic_loci"][ + 0]["grch38"]["hgvs_genomic_description"] + + def test_inv(self): + variant = "NW_011332691.1(NM_012234.6):c.335+1_335+6inv" + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "NM_012234.6:c.335+1_335+6inv" in list(results.keys()) + assert "NW_011332691.1(NM_012234.6):c.335+1_335+6inv" in results[ + "NM_012234.6:c.335+1_335+6inv"]["genome_context_intronic_sequence"] + assert "NW_011332691.1:g.53823_53828inv" in results["NM_012234.6:c.335+1_335+6inv"]["alt_genomic_loci"][0][ + "grch38"]["hgvs_genomic_description"] + + def test_dup(self): + variant = "NW_011332691.1(NM_012234.6):c.335+1dup" + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "NM_012234.6:c.335+1dup" in list(results.keys()) + assert "NW_011332691.1(NM_012234.6):c.335+1dup" in results[ + "NM_012234.6:c.335+1dup"]["genome_context_intronic_sequence"] + assert "NW_011332691.1:g.53828dup" in results["NM_012234.6:c.335+1dup"]["alt_genomic_loci"][0]["grch38"][ + "hgvs_genomic_description"] + + def test_identity(self): + variant = "NW_011332691.1(NM_012234.6):c.335+1_335+6=" + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "NM_012234.6:c.335+1_335+6=" in list(results.keys()) + assert "NW_011332691.1(NM_012234.6):c.335+1_335+6=" in results[ + "NM_012234.6:c.335+1_335+6="]["genome_context_intronic_sequence"] + assert "NW_011332691.1:g.53823_53828=" in results["NM_012234.6:c.335+1_335+6="]["alt_genomic_loci"][0]["grch38"][ + "hgvs_genomic_description"] + +# +# Copyright (C) 2016-2024 VariantValidator Contributors +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# + + + From 25701196b944b30dbf735a1adff7a935e6d26afc Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Thu, 5 Dec 2024 14:30:18 +0000 Subject: [PATCH 5/7] code that deals with protein references with nucleotide variant types --- VariantValidator/modules/use_checking.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/VariantValidator/modules/use_checking.py b/VariantValidator/modules/use_checking.py index d1cda229..1441f56e 100644 --- a/VariantValidator/modules/use_checking.py +++ b/VariantValidator/modules/use_checking.py @@ -36,6 +36,16 @@ def refseq_common_mistakes(variant): logger.warning(error) return True + # NP_ c. + if ("NP_" in variant.quibble or "ENSP" in variant.quibble) and (variant.reftype == ':c.' or + variant.reftype == ':n.' or + variant.reftype == ':g.' or + variant.reftype == ':r.'): + error = f'Protein reference sequence input as Nucleotide ({variant.reftype}) variant.' + variant.warnings.append(error) + logger.warning(error) + return True + # NM_ n. if variant.transcript_type == "c" and variant.reftype == ':n.': suggestion = variant.quibble.replace(':n.', ':c.') From cf2cfe16352dd25bcf750c3b5c6e3102fa398da1 Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Thu, 5 Dec 2024 15:00:41 +0000 Subject: [PATCH 6/7] Update position added for Ter= --- VariantValidator/modules/vvMixinCore.py | 15 --------------- VariantValidator/modules/vvMixinInit.py | 7 ++++--- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/VariantValidator/modules/vvMixinCore.py b/VariantValidator/modules/vvMixinCore.py index 426e39bd..885c5245 100644 --- a/VariantValidator/modules/vvMixinCore.py +++ b/VariantValidator/modules/vvMixinCore.py @@ -1349,21 +1349,6 @@ def validate(self, ) or re.search('p.?', re_parse_protein_single_aa): re_parse_protein_single_aa = re_parse_protein_single_aa.replace('p.=', 'p.(=)') - if re.search("p.\(Ter\d+=\)", predicted_protein_variant_dict['tlr']): - predicted_protein_variant_dict['tlr'] = \ - predicted_protein_variant_dict['tlr'].split("p.")[0] - predicted_protein_variant_dict['tlr'] = \ - predicted_protein_variant_dict['tlr'] + "p.(Ter=)" - re_parse_protein_single_aa = re_parse_protein_single_aa.split("p.")[0] - re_parse_protein_single_aa = re_parse_protein_single_aa + "p.(*=)" - - elif re.search("p.Ter\d+=", predicted_protein_variant_dict['tlr']): - predicted_protein_variant_dict['tlr'] = \ - predicted_protein_variant_dict['tlr'].split("p.")[0] - predicted_protein_variant_dict['tlr'] = \ - predicted_protein_variant_dict['tlr'] + "p.Ter=" - re_parse_protein_single_aa = re_parse_protein_single_aa.split("p.")[0] - re_parse_protein_single_aa = re_parse_protein_single_aa + "p.*=" # Capture instances of variation affecting p.1 if re.search("[A-Z][a-z[a-z]1[?]", predicted_protein_variant_dict['tlr']): diff --git a/VariantValidator/modules/vvMixinInit.py b/VariantValidator/modules/vvMixinInit.py index e8304a98..6a1df525 100644 --- a/VariantValidator/modules/vvMixinInit.py +++ b/VariantValidator/modules/vvMixinInit.py @@ -251,9 +251,10 @@ def myc_to_p(self, hgvs_transcript, evm, re_to_p, hn): if hgvs_transcript.type == 'c': # Handle non inversions with simple c_to_p mapping - if (hgvs_transcript.posedit.edit.type != 'inv') and (hgvs_transcript.posedit.edit.type != 'dup') and \ - (hgvs_transcript.posedit.edit.type != 'delins') and (hgvs_transcript.posedit.edit.type != 'sub') and (hgvs_transcript.posedit.edit.type != 'identity') \ - and (re_to_p is False): + if ((hgvs_transcript.posedit.edit.type != 'inv') and (hgvs_transcript.posedit.edit.type != 'dup') and + (hgvs_transcript.posedit.edit.type != 'delins') and (hgvs_transcript.posedit.edit.type != 'sub') + and (hgvs_transcript.posedit.edit.type != 'identity') + and (re_to_p is False)): hgvs_protein = None # Does the edit affect the start codon? if ((1 <= hgvs_transcript.posedit.pos.start.base <= 3 and hgvs_transcript.posedit.pos.start.offset == 0) From 36d42378d8ee8b62d2154d9f842f6722c230e8cd Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Wed, 11 Dec 2024 11:29:51 +0000 Subject: [PATCH 7/7] Update vdb version and test issue 87 --- db_dockerfiles/vdb/Dockerfile | 2 +- tests/test_inputs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/db_dockerfiles/vdb/Dockerfile b/db_dockerfiles/vdb/Dockerfile index 763b2293..c01d08d2 100644 --- a/db_dockerfiles/vdb/Dockerfile +++ b/db_dockerfiles/vdb/Dockerfile @@ -15,6 +15,6 @@ RUN rm -rf /var/lib/apt/lists/* RUN echo '[mysqld]' >> /etc/mysql/my.cnf && \ echo 'max_connections=250' >> /etc/mysql/my.cnf -RUN wget https://www528.lamp.le.ac.uk/vvdata/validator/validator_2024_09.sql.gz -O /docker-entrypoint-initdb.d/validator_2024_09.sql.gz +RUN wget https://www528.lamp.le.ac.uk/vvdata/validator/validator_2024_12.sql.gz -O /docker-entrypoint-initdb.d/validator_2024_12.sql.gz CMD ["mysqld"] diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 1558486a..08051986 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -30477,7 +30477,7 @@ def test_issue_87(self): variant = 'NC_000009.11:g.130577961C>T' results = self.vv.validate(variant, 'GRCh37', 'NM_001278138.1').format_as_dict(test=True) print(results) - assert '*=' in results['NM_001278138.1:c.1431G>A']['hgvs_predicted_protein_consequence']['slr'] + assert '*477=' in results['NM_001278138.1:c.1431G>A']['hgvs_predicted_protein_consequence']['slr'] def test_issue_185(self): variant = 'NC_000023.10(NM_004006.2):c.6615_7660del'