forked from genome-nexus/annotation-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvariant_notation_converter.py
147 lines (129 loc) · 5.98 KB
/
variant_notation_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
import re
VALID_SPECIAL_CHROMOSOMES = {"X", "Y"} # MT chromosome references not handled
MINIMAL_GENOMIC_OFFSET = 1
SPECIES_TO_SEX_CHROMOSOME_OFFSET = {"Homo sapiens" : "23", "Mus musculus" : "20", "Rattus norvegicus" : "21", "Bos taurus" : "30", "Danio rerio" : None, "Caenorhabditis elegans" : "6", "Saccharomyces cerevisiae" : None}
NUCLEOTIDE_SEQUENCE_RE_PATTERN = "^([ATGC]+)$"
def is_string(value):
if value == None:
return False
return isinstance(value, str)
def is_positive_integer(string_value):
if not is_string(string_value):
return False
try:
i = int(string_value)
return i > 0
except ValueError:
return False
def offsets_are_valid(start, end):
if not is_string(start) or not is_string(end):
return False
try:
s = int(start)
e = int(end)
return s >= MINIMAL_GENOMIC_OFFSET and e >= MINIMAL_GENOMIC_OFFSET and s <= e
except ValueError:
return False
def chromosome_is_valid(chromosome):
if not is_string(chromosome):
return False
trimmed_chromosome = chromosome.strip()
if trimmed_chromosome in VALID_SPECIAL_CHROMOSOMES:
return True
return is_positive_integer(trimmed_chromosome)
def allele_is_valid(allele):
return is_string(allele) # note : any string (even empty) might be acceptable in one of the provided tumor seq allele positions
def allele_is_empty(allele):
stripped_allele = allele.strip()
return stripped_allele == "NA" or stripped_allele == ""
def genomic_variant_has_proper_format(genomic_variant):
if genomic_variant == None:
return False
if not isinstance(genomic_variant, list):
return False
if not len(genomic_variant) == 6:
return False
[ chromosome, start, end, ref_allele, seq_allele1, seq_allele2 ] = genomic_variant
if not chromosome_is_valid(chromosome):
return False
if not offsets_are_valid(start, end):
return False
if not allele_is_valid(ref_allele) or not allele_is_valid(seq_allele1) or not allele_is_valid(seq_allele2):
return False
if allele_is_empty(seq_allele1) and allele_is_empty(seq_allele2):
return False
return True
def species_code_is_valid(species_code):
if not is_string(species_code):
return False
return species_code in SPECIES_TO_SEX_CHROMOSOME_OFFSET
def increment_integer_string(string_value):
return str(int(string_value) + 1)
def normalize_chromosome(chromosome, sex_chromosome_offset):
normalized_chromosome = chromosome.strip()
if normalized_chromosome[0:3] == "chr":
normalized_chromosome = normalized_chromosome[3:]
if sex_chromosome_offset == None:
return normalized_chromosome
x_chromosome_offset = sex_chromosome_offset
if normalized_chromosome == x_chromosome_offset:
return "X"
y_chromosome_offset = increment_integer_string(x_chromosome_offset)
if normalized_chromosome == y_chromosome_offset:
return "Y"
return normalized_chromosome
def normalize_offset(offset):
return offset.strip()
def normalize_allele(allele):
return allele.strip().upper()
def normalize_genomic_variant(genomic_variant, sex_chromosome_offset):
[ chromosome, start, end, ref_allele, seq_allele1, seq_allele2 ] = genomic_variant
normalized_genomic_variant = []
normalized_genomic_variant.append(normalize_chromosome(chromosome, sex_chromosome_offset))
normalized_genomic_variant.append(normalize_offset(start))
normalized_genomic_variant.append(normalize_offset(end))
normalized_genomic_variant.append(normalize_allele(ref_allele))
normalized_genomic_variant.append(normalize_allele(seq_allele1))
normalized_genomic_variant.append(normalize_allele(seq_allele2))
return normalized_genomic_variant
def seq_allele_ambiguity_exists(ref_allele, seq_allele1, seq_allele2):
if allele_is_empty(seq_allele1) or seq_allele1 == ref_allele:
return False
if allele_is_empty(seq_allele2) or seq_allele2 == ref_allele:
return False
if seq_allele1 == "-" and re.match(NUCLEOTIDE_SEQUENCE_RE_PATTERN, seq_allele2):
return True
if seq_allele2 == "-" and re.match(NUCLEOTIDE_SEQUENCE_RE_PATTERN, seq_allele1):
return True
return False
def select_variant_allele(ref_allele, seq_allele1, seq_allele2):
if allele_is_empty(seq_allele1) or seq_allele1 == ref_allele:
return seq_allele2
if allele_is_empty(seq_allele2) or seq_allele2 == ref_allele:
return seq_allele1
if seq_allele_ambiguity_exists(ref_allele, seq_allele1, seq_allele2):
if seq_allele2 == "-":
return seq_allele1
return seq_allele2
return seq_allele1
def genomic_variant_to_hgvs(genomic_variant, species_name):
if not genomic_variant_has_proper_format(genomic_variant):
return None
if not species_code_is_valid(species_name):
return None
# normalize genomic location
chromosome_offset = SPECIES_TO_SEX_CHROMOSOME_OFFSET[species_name];
normalized_genomic_variant = normalize_genomic_variant(genomic_variant, chromosome_offset)
[ chromosome, start, end, ref_allele, seq_allele1, seq_allele2 ] = normalized_genomic_variant
# detect type and do conversoin
selected_variant_allele = select_variant_allele(ref_allele, seq_allele1, seq_allele2)
if ref_allele == "-" or len(ref_allele) == 0 or ref_allele == "NA" or ref_allele.find("--") != -1:
return chromosome + ":g." + start + "_" + increment_integer_string(start) + "ins" + selected_variant_allele
if selected_variant_allele == '-' or len(selected_variant_allele) == 0:
return chromosome + ":g." + start + "_" + end + "del" + ref_allele;
if len(ref_allele) > 1 or len(selected_variant_allele) > 1:
return chromosome + ":g." + start + "_" + end + "del" + ref_allele + "ins" + selected_variant_allele;
else:
return chromosome + ":g." + start + ref_allele + ">" + selected_variant_allele;
print("not handled yet : received " + str(genomic_variant))