-
Notifications
You must be signed in to change notification settings - Fork 1
/
taxonomic_density.py
184 lines (153 loc) · 7.19 KB
/
taxonomic_density.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os, sys
from phylogeny_utilities.utilities import *
from collections import Counter
import numpy as np
import scipy.stats
#from score import *
# fa=read_from_fasta('pasta.fasta')
# owners=[k for k in fa.keys() if fa[k] == dupe_seqs[0]]
taxrank_colsdict = {
'superkingdom': 2,
'phylum': 7,
'class': 11,
'order': 16,
'family': 20,
'genus': 26,
'species': 32
}
taxrank=[2, 7, 11, 16, 20, 26, 32]
cols=['root', 'root_', 'superkingdom', 'superkingdom_', 'superkingdom__', 'superkingdom___', 'superkingdom____',
'phylum', 'phylum_', 'phylum__', 'subphylum', 'class', 'class_', 'class__', 'class___', 'subclass', 'order',
'order_', 'order__', 'suborder', 'family', 'family_', 'family__', 'family___', 'subfamily', 'tribe', 'genus',
'genus_', 'subgenus', 'species_group', 'species_subgroup', 'species_subgroup_', 'species', 'species_',
'species__', 'species___', 'subspecies', 'subspecies_', 'subspecies__', 'subspecies___']
rank_cols=dict(zip(cols,range(1,len(cols)+1)))
def get_taxonomy_table(folder=None):
# tax_f = open('/projects/tallis/nute/data/ncbi-2017/taxonomy_folder/taxonomy.table', 'r')
# tax_f = open('/projects/tallis/nute/data/ncbi-2017/taxonomy_folder/taxonomy.table.with_GT', 'r')
if folder is None:
tax_f = open('taxonomy.table', 'r')
else:
tax_f = open(os.path.join(folder,'taxonomy.table'), 'r')
tax_f.readline()
taxa_dict = {}
for ln in tax_f:
a = ln.strip()[1:-1].split('\",\"')
taxa_dict[int(a[0])] = a[1:]
tax_f.close()
return taxa_dict
def fill_parent_vector(mytaxid, tax_dict=None, specmap=None):
if tax_dict is None:
tax_dict = get_taxonomy_table()
taxid = mytaxid
new_taxid = None
new_lev = None
parent_vector = [-1, ] * 40
lev = rank_cols[tax_dict[taxid][1]]
parent_vector[lev - 1] = taxid
while new_taxid != 1:
new_taxid = int(tax_dict[taxid][0]) #also sharp
new_lev = rank_cols[tax_dict[new_taxid][1]] # the level, i.e. a number
# fill in all the levels until new_lev
assert new_lev < lev, 'somehow the rank went down'
for i in range(lev - new_lev):
parent_vector[lev - i - 1] = taxid
parent_vector[new_lev - 1] = new_taxid
taxid = new_taxid
lev = new_lev
return parent_vector
def get_taxonomic_diversity_for_sequence(sq, fa, line_lead = '', tax_tab=None, specmap=None):
'''
Finds the distribution and calculates summary statistics of the set of taxa in a fasta
that shares a sequence.
:param sq: (string). a seqeunce whose detailed lineage are looking for.
:param fa: a dictionary object representing a FASTA file.
:return: Returns the percentage of the sequences taht belong to the most dominant taxon id,
for each of the 7 major taxonomic levels. Also returns some basic metadata.
'''
# find the taxa that share this sequence:
owners = [k for k in fa.keys() if fa[k]==sq]
taxa_np = np.ones((len(owners),40),dtype=np.int32)*-1
num_owners = len(owners)
# debugging
# print(str(num_owners))
# print("length of owners vector: %s" % len(owners))
# print(str(owners))
for i in range(len(owners)):
taxa_np[i,:] = fill_parent_vector(int(specmap[owners[i]]), tax_tab,specmap)
print(line_lead + '...getting taxon info for genome %d of %d' % (i, num_owners),end='\r',flush=True)
# if verbose and i > cut:
# print('finished with %s owners out of %s' % (cut,len(owners)))
# cut += cut_gap
print(line_lead + '...DONE... ', end='\r', flush=True)
ranks = ['superkingdom','phylum','class','order','family','genus','species']
ranks_2char = {'superkingdom':'sk','phylum':'ph','class':'cl','order':'or','family':'fa','genus':'ge','species':'sp'}
dominant_taxa = dict.fromkeys(ranks,None)
for k in ranks:
col=taxrank_colsdict[k]
md = scipy.stats.mode(taxa_np[:,col])
dominant_taxa[k] = (md[0][0], md[1][0], md[1][0] / num_owners) # 3-tuple with (value, count, % of column)
dominant_taxa[ranks_2char[k]+'_str'] = ranks_2char[k] + ':(%d, %d, %.2f)' % dominant_taxa[k]
dominant_taxa['rep_seq_name'] = owners[0]
dominant_taxa['num_owners'] = num_owners
return dominant_taxa
def review_fasta_file_with_duplicates(fasta_path, debug = False, initial_only = False):
'''
Basically what we want to know is if a a big group of identical sequences are in here, do they belong to the
same (basic) organism.
:param fasta_path:
:return:
'''
print('...opening file and gathering initial data...',end='\r',flush=True)
fp_full = os.path.abspath(fasta_path)
#debugging
# print('\t\tfp_full: %s' % fp_full)
fp_fold, fp_file = os.path.split(fp_full)
# print('\t\tfp_fold: %s' % fp_fold)
# print('\t\tfp_file: %s' % fp_file)
assert ('species.mapping' in os.listdir(fp_fold)), "The location of the alignment must also have the species mapping"
specmap = get_dict_from_file(os.path.join(fp_fold,'species.mapping'), delimiter=",")
# print('specmap has length: %s' % len(specmap))
tax_tab = get_taxonomy_table(fp_fold)
fa=read_from_fasta(fasta_path)
c=Counter(list(fa.values()))
dupe_seqs = [k for k in c.keys() if c[k]>1]
metadata= {
'full_fasta_path': fp_full,
'num_seqs':len(list(fa.keys())),
'num_cols':len(list(fa.values())[0]),
'num_unique_seqs':len(c),
'num_dupe_seqs':len(dupe_seqs)
}
# Sort the dupe list so largest dupes are first:
dupe_seqs.sort(reverse=True, key=lambda x: c[x])
dupe_cts = list(map(lambda x: c[x], dupe_seqs))
print(' ', end='\r', flush=True)
cutoff = 50 # if a sequence doesn't show up at least this many times, we're not
# going to worry about whether multiple taxa show up.
num_seqs_to_check = sum([1 for k in dupe_cts if k > cutoff])
results = {}
print('Fasta File: %s' % fp_full)
print(' -# seqs: %5d, -# cols: %s' % (metadata['num_seqs'], metadata['num_cols']))
print(' -# unique seqs: %5d, -# duplicates: %s' % ( metadata['num_unique_seqs'], metadata['num_dupe_seqs']))
print(' -cutoff: %s copies -# over cutoff: %s' % (cutoff, num_seqs_to_check))
if initial_only is True:
print('\t(option for initial information only was given, stopping here)')
sys.exit(0)
print('\n')
print(' ###) #copies:, dominant taxon info (Tax ID, Freq (ct), % of Tot)...')
#'%4d) %5d cp,'
for i in range(num_seqs_to_check):
# print(' ###) #copies:,
ln_lead = '%4d) %5d cp,' % (i,dupe_cts[i])
dt = get_taxonomic_diversity_for_sequence(dupe_seqs[i], fa, ln_lead, tax_tab, specmap)
tax_str = ', '.join([dt['ph_str'], dt['cl_str'], dt['or_str'], dt['fa_str'], dt['ge_str'], dt['sp_str']])
print(ln_lead + ' ' + tax_str)
if debug and i >=1:
break
if __name__=='__main__':
fp = sys.argv[1]
if '-i' in sys.argv or '--initial_only' in sys.argv:
review_fasta_file_with_duplicates(fp, False, True)
sys.exit(0)
review_fasta_file_with_duplicates(fp,False)