-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqueryExon.py
163 lines (128 loc) · 7.77 KB
/
queryExon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import sys, os
import utils
class QueryExonSeqs:
def __init__(self, ref_species, exons_list, chromosome, ref_strand, two_bit_dir, query_species_list, maf_index, verbosity):
self.ref_species = ref_species
self.exons_list = exons_list
self.chromosome = chromosome
self.ref_strand = ref_strand
self.two_bit_dir = two_bit_dir
self.query_species_list = query_species_list
self.maf_index = maf_index
self.verbosity = verbosity
def get_query_seqs(self): ## this is the wrapper function that contains calls to different functions which do the actual work
temp_maf = utils.create_temp_devshm()
temp_maf_filtered = utils.create_temp_devshm()
start_dict, stop_dict, chr_dict, strand_dict = dict(), dict(), dict(), dict()
exon_indices_list = list(range(0, len(self.exons_list)))
for species in self.query_species_list:
start_dict[species] = {exon_index:None for exon_index in exon_indices_list}
stop_dict[species] = {exon_index:None for exon_index in exon_indices_list}
chr_dict[species] = {exon_index:None for exon_index in exon_indices_list}
strand_dict[species] = {exon_index:None for exon_index in exon_indices_list}
if self.verbosity > 1:
print(f'Species under consideration are {self.query_species_list}')
for exon_index, exon in enumerate(self.exons_list):
if self.verbosity > 1:
print(f'The following exon is under consideration: {self.chromosome}-{exon}')
utils.run_maf_extract(self.maf_index, temp_maf, self.chromosome, exon)
utils.filter_maf_file(temp_maf, temp_maf_filtered, self.query_species_list)
cds_start_query, cds_stop_query, strand_dict_query, chr_dict_query = self.__get_coordinates_from_maf(temp_maf_filtered)
os.remove(temp_maf)
os.remove(temp_maf_filtered)
for species in self.query_species_list:
if species in cds_start_query:
start_dict[species][exon_index] = cds_start_query[species]
stop_dict[species][exon_index] = cds_stop_query[species]
chr_dict[species][exon_index] = chr_dict_query[species]
strand_dict[species][exon_index] = strand_dict_query[species]
## now check if all the exons for a transcript are colinear. If not then exclude these
## colinear means - one strand, one scaffold/chromosome. all the start_cds should be linearly arranged. all the stop_cds should be linearly arranged
query_species_cds = dict()
valid_species = 0
species_ignore = list()
for species in self.query_species_list:
cds_starts = list(start_dict[species].values())
cds_stops = list(stop_dict[species].values())
strand_set = set(strand_dict[species].values())
chromosome_set = set(chr_dict[species].values())
## allow for those cases where one of the exons in the middle has a None value for cds_starts/cds_stop but do not allow for the first or the last exon to have a None value
if cds_starts[0] == None or cds_starts[-1] == None:
species_ignore.append(species)
if self.verbosity > 1:
print(f'Omitting species {species} because the cds_starts have a None value at the beginning/end. Here is the list {cds_starts}')
continue
########################################################################
## Remove None values from these lists
if None in strand_set:
chromosome_set.remove(None)
strand_set.remove(None)
cds_starts = [start for start in cds_starts if start != None]
cds_stops = [stop for stop in cds_stops if stop != None]
########################################################################
if len(strand_set) == 1 and len(chromosome_set) == 1:
strand = list(strand_set)[0]
chromosome = list(chromosome_set)[0]
if strand == self.ref_strand:
if cds_starts == sorted(cds_starts) and cds_stops == sorted(cds_stops):
query_species_cds[species] = f'{species}#{cds_starts[0]}#{cds_stops[-1]}#{strand}#{chromosome}'
valid_species += 1
else:
if cds_starts == sorted(cds_starts)[::-1] and cds_stops == sorted(cds_stops)[::-1]:
query_species_cds[species] = f'{species}#{cds_stops[-1]}#{cds_starts[0]}#{strand}#{chromosome}'
valid_species += 1
return query_species_cds, valid_species
def __get_coordinates_from_maf(self, input_maf):
ref_species = self.ref_species
species_list = self.query_species_list
with open(input_maf, 'r') as FI:
maf_extract_result = FI.readlines()
maf_extract_result = [line.rstrip('\n') for line in maf_extract_result]
cds_list_dict = dict()
chr_tracker_dict, strand_tracker_dict = dict(), dict()
for species in species_list:
cds_list_dict[species] = list()
chr_tracker_dict[species] = set()
strand_tracker_dict[species] = set()
for line in maf_extract_result:
if line.startswith('s'):
species, chromosome, start, stop, strand, src_size, seq = utils.get_sline_data(line)
if species == ref_species:
continue
cds_start_query = cds_stop_query = '', ''
if strand == '-':
cds_start_query = src_size - start - stop
cds_stop_query = src_size - start
else:
cds_start_query = start
cds_stop_query = start + stop
cds_string = f'{cds_start_query}-{cds_stop_query}'
cds_list_dict[species].append(cds_string)
chr_tracker_dict[species].add(chromosome)
strand_tracker_dict[species].add(strand)
## Now identify those species for which the aligning sequence comes from more than one scaffold/chromosome or strand
species_exclude = [species for species in species_list if len(strand_tracker_dict[species]) != 1 or len(chr_tracker_dict[species]) != 1]
species_list = [species for species in species_list if species not in species_exclude] ## and update the species_list
strand_query_dict = {species:list(element)[0] for species, element in strand_tracker_dict.items() if len(element) == 1}
chr_query_dict = {species:list(element)[0] for species, element in chr_tracker_dict.items() if len(element) == 1}
cds_start_query_dict, cds_stop_query_dict = dict(), dict()
for species in species_list:
cds_list = cds_list_dict[species]
if len(cds_list) > 0:
if strand_query_dict[species] == '-':
cds_list = cds_list[::-1]
first_element = cds_list[0]
last_element = cds_list[-1]
cds_start_query_dict[species], _ = first_element.split('-')
_, cds_stop_query_dict[species] = last_element.split('-')
else:
cds_start_query_dict[species] = None
cds_stop_query_dict[species] = None
if self.verbosity > 1:
print("The __get_coordinates_from_maf function returns the following values:")
print(cds_start_query_dict)
print(cds_stop_query_dict)
print(strand_query_dict)
print(chr_query_dict)
print("\n")
return cds_start_query_dict, cds_stop_query_dict, strand_query_dict, chr_query_dict