-
Notifications
You must be signed in to change notification settings - Fork 2
/
TIP_finder_utils.py
283 lines (249 loc) · 11.7 KB
/
TIP_finder_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""
TIP_finder Utils: Transposon Polymorphic Insertion finder Utilities
© Copyright
Developed by Simon Orozco Arias
email: [email protected]
2020
"""
#!/bin/env python3
import sys
import os
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import argparse
def createFinalMatrix(te, directory, outputDir, map_th):
if os.path.exists(outputDir):
print("WARNING: output directory: "+outputDir+" already exist, be careful")
else:
os.mkdir(outputDir)
##############################################################
### creating final matrix
sample_names = []
ins_samples = {}
file_number = 0
for covf in os.listdir(directory):
if "coveragebed_" in covf and "-vs-"+te in covf:
file_number += 1
lines = open(directory+"/"+covf, 'r').readlines()
sample_name = covf.replace("coveragebed_", "").replace("-vs-"+te, "").replace("_per10kb.bed", "")
sample_names.append(sample_name)
for line in lines:
columns = line.split('\t')
if int(columns[3]) >= map_th:
new_ins = columns[0]+"_"+columns[1]+"_"+columns[2]
if new_ins in ins_samples.keys():
ins_samples[new_ins].append(sample_name)
else:
ins_samples[new_ins] = [sample_name]
final_matrix = pd.DataFrame(0, index=range(len(ins_samples.keys())), columns=range(len(sample_names)))
final_matrix.columns = sample_names
final_matrix.index = list(ins_samples.keys())
for k in ins_samples.keys():
for s in ins_samples[k]:
final_matrix.loc[k, s] = 1
final_matrix.to_csv(outputDir+'/final_matrix_'+te+'.csv')
def histograms(outputDir, matrixPath):
final_matrix = pd.read_csv(matrixPath, index_col = 0)
final_matrix['sum'] = final_matrix.sum(axis=1)
### plotting frequencies in log2 per each insertion site
print("plotting frequencies in log base 2 per each insertion site")
plt.figure(figsize=(9, 8))
plt.xlabel('Frequencies (log)')
sns.distplot([math.log(x,2) for x in final_matrix['sum']], color='b', hist_kws={'alpha': 0.4}, rug=True);
plt.savefig(outputDir+'/insertion_frequencies_log.png', dpi=300)
### plotting frequencies per each insertion site
print("plotting frequencies per each insertion site")
final_matrix = final_matrix.drop(columns="sum")
final_matrix.loc['samplesSum'] = final_matrix.sum()
plt.figure(figsize=(9, 8))
sns.distplot(final_matrix.loc['samplesSum'], color='b', hist_kws={'alpha': 0.4}, rug=True);
plt.xlabel('Frequencies')
plt.savefig(outputDir+'/inserts_per_samples.png', dpi=300)
def countPerChrs(matrixPathCase1, matrixPathCase2, outputDir, graph):
final_matrixcase1 = pd.read_csv(matrixPathCase1, index_col = 0)
final_matrixcase2 = pd.read_csv(matrixPathCase2, index_col = 0)
totalIndividuals = len(final_matrixcase1.columns) + len(final_matrixcase2.columns)
### sum TIPs per case
final_matrixcase1['sum'] = final_matrixcase1.sum(axis=1)
final_matrixcase2['sum'] = final_matrixcase2.sum(axis=1)
insertion_sites = list(final_matrixcase1.index.values)
insertion_sites2 = list(final_matrixcase2.index.values)
for ins in insertion_sites2:
if ins not in insertion_sites:
insertion_sites.append(ins)
final_count = pd.DataFrame(0, index=range(len(insertion_sites)), columns=range(2))
final_count.columns = ["Case 1", "Case 2"]
final_count.index = insertion_sites
for ins in insertion_sites:
contCase1 = 0
contCase2 = 0
if ins in final_matrixcase1.index:
contCase1 = final_matrixcase1.loc[ins, 'sum']
if ins in final_matrixcase2.index:
contCase2 = final_matrixcase2.loc[ins, 'sum']
final_count.loc[ins, "Case 1"] = contCase1
final_count.loc[ins, "Case 2"] = contCase2
if graph == True:
print("plotting frequencies of each case")
plt.figure(figsize=(9, 8))
sns.distplot(final_count['Case 1'], hist=False, rug=True, color='b');
sns.distplot(final_count['Case 2'], hist=False, rug=True, color='r');
plt.xlabel('Frequencies')
plt.savefig(outputDir+'/inserts_per_case.png', dpi=300)
final_count.to_csv(outputDir+'/TIPscount_cases.csv')
return totalIndividuals - 2
def countPerWindow(filetips, windows):
final_count = pd.read_csv(filetips, index_col = 0)
### to find the biggest position of a TIP
maximum = max([int(x.split("_")[2]) for x in list(final_count.index.values)])
max([int(x.split("_")[2]) for x in list(final_count.index.values)])
#print(maximum)
### to extract uniq id of each chromosome
chr_ids = list(dict.fromkeys([x.split("_")[0] for x in list(final_count.index.values)]))
chrs_case1 = {}
chrs_case2 = {}
maxNumber = 0
for ch in chr_ids:
case1 = [0 for x in range(0, int(maximum), int(windows))]
case2 = [0 for x in range(0, int(maximum), int(windows))]
for ins in list(final_count.index.values):
if ch+"_" in ins:
end = int(ins.split('_')[2])
interval = int(end/windows)
#print("interval "+str(interval)+", total Length: "+str(len(case1))+", insertion: "+ins)
case1[interval] += int(final_count.loc[ins, "Case 1"])
case2[interval] += int(final_count.loc[ins, "Case 2"])
chrs_case1[ch] = case1
chrs_case2[ch] = case2
ymax1 = max(case1)
ymax2 = max(case2)
ymax = max(ymax1,ymax2)
if ymax >= maxNumber:
maxNumber = ymax
for ch in chr_ids:
plt.figure(figsize=(9, 8))
plt.plot([x for x in range(0, maximum, windows)], chrs_case1[ch], 'b', [x for x in range(0, maximum, windows)], chrs_case2[ch], 'r')
plt.xlabel('Chromosome length')
axes = plt.gca()
axes.set_ylim([0, maxNumber])
plt.savefig(outputDir+'/chr_'+ch+'.png', dpi=300)
plt.close()
def associationTest(filetips, individuals, prob):
final_count = pd.read_csv(filetips, index_col = 0)
assocations = pd.DataFrame(0, index=range(len(list(final_count.index))), columns=range(4))
assocations.columns = ["Case 1", "Case 2", "P-value", "Association"]
assocations.index = list(final_count.index.values)
for ins in list(final_count.index.values):
cases1Pos = int(final_count.loc[ins, "Case 1"])
cases1Neg = individuals - int(final_count.loc[ins, "Case 1"])
cases2Pos = int(final_count.loc[ins, "Case 2"])
cases2Neg = individuals - int(final_count.loc[ins, "Case 2"])
# contingency table
table = [[cases1Pos,cases2Pos], [cases1Neg, cases2Neg]]
stat, p, dof, expected = chi2_contingency(table)
critical = chi2.ppf(prob, dof)
# interpret p-value
alpha = 1.0 - prob
if p <= alpha:
measureAsoc = math.sqrt(float(stat)/(float(stat) + individuals))
assocations.loc[ins, "Case 1"] = cases1Pos
assocations.loc[ins, "Case 2"] = cases2Pos
assocations.loc[ins, "P-value"] = p
assocations.loc[ins, "Association"] = measureAsoc
### remove TIPs with no statistical assocation
assocations.drop(assocations[assocations.Association == 0].index, inplace=True)
print("TIPs with statistical assocation: "+str(len(assocations)))
assocations.to_csv(outputDir+'/TIPS_with_association.csv')
if __name__ == '__main__':
print("\n#########################################################################")
print("# #")
print("# TIP_finder Utils: Transposable Element Insertion Polymorphisms finder #")
print("# utilities #")
print("#########################################################################\n")
### read parameters
parser = argparse.ArgumentParser()
parser.add_argument('-u','--util',required=True,dest='util',help='Utility to be used, must be: finalMatrix or histograms or peaks or association')
parser.add_argument('-t','--te',dest='te',help='TE family name (used in finalMatrix util)')
parser.add_argument('-o','--output-dir',required=True,dest='outputDir',help='Path of the output directory')
parser.add_argument('-d','--directory',dest='directory',help='Directory which contains coveraged files. (used in finalMatrix util).')
parser.add_argument('-m','--map-thr',dest='map_th',help='Minimum number of maps. Default 5. (used in finalMatrix util).')
parser.add_argument('-f','--matrix-path',dest='matrixPath',help='Path to the final matrix. (used in histograms util).')
parser.add_argument('-1','--case1-matrix',dest='matrixPathCase1',help='Path to the final matrix of case 1. (used in peaks and association utilities).')
parser.add_argument('-2','--case2-matrix',dest='matrixPathCase2',help='Path to the final matrix of case 2. (used in peaks and association utilities).')
parser.add_argument('-w','--windows',type=int,dest='windows',help='Window length to graph TIPs. Dafault 1000000. (used in peaks util)')
parser.add_argument('-n','--confidence-level',type=float,dest='prob',help='confidence level used in association tests. Dafault 0.95. (used in association util)')
parser.add_argument('-v','--version',action='version', version='%(prog)s v1.0')
options = parser.parse_args()
util = options.util
te = options.te
outputDir = options.outputDir
directory = options.directory
map_th = options.map_th
matrixPath = options.matrixPath
matrixPathCase1 = options.matrixPathCase1
matrixPathCase2 = options.matrixPathCase2
prob = options.prob
windows = options.windows
if util == None:
print('FATAL ERROR: Missing util parameter (-u or --util). Exiting')
sys.exit(0)
if outputDir == None:
print('FATAL ERROR: Missing output directory (-o or --output-dir). Exiting')
sys.exit(0)
########################################################
### execution of utilities
if util == "finalMatrix":
if directory == None:
print('FATAL ERROR: Missing input directory (-d or --directory). Exiting')
sys.exit(0)
if te == None:
print('FATAL ERROR: Missing name of the TE family (-t or --te). Exiting')
sys.exit(0)
if map_th == None:
print('WARNING: Missing minimum number of maps (-m or --map-thr). Using by default 5')
map_th = 5
createFinalMatrix(te, directory, outputDir, int(map_th))
elif util == "histograms":
if matrixPath == None:
print('FATAL ERROR: Missing path to the final matrix (-f or --matrix-path). Exiting')
sys.exit(0)
histograms(outputDir, matrixPath)
elif util == "peaks":
if matrixPathCase1 == None:
print('FATAL ERROR: Missing path to the matrix of individuals from case 1 (-1 or --case1-matrix). Exiting')
sys.exit(0)
if matrixPathCase2 == None:
print('FATAL ERROR: Missing path to the matrix of individuals from case 2 (-2 or --case2-matrix). Exiting')
sys.exit(0)
if windows == None:
print('WARNING: Missing window length to be used in graphs (-w or --windows). Using by default 1000000')
windows = 1000000
### to count number of TIPs by cases
individuals = countPerChrs(matrixPathCase1, matrixPathCase2, outputDir, 1) # graph 1 indicates create graphs, otherwise indicates do not generate graphs.
print(str(individuals)+" individuals processed")
### using file created by countPerChrs, count number of TIPs per a given window size
countPerWindow(outputDir+"/TIPscount_cases.csv", int(windows))
elif util == "association":
if matrixPathCase1 == None:
print('FATAL ERROR: Missing path to the matrix of individuals from case 1 (-1 or --case1-matrix). Exiting')
sys.exit(0)
if matrixPathCase2 == None:
print('FATAL ERROR: Missing path to the matrix of individuals from case 2 (-2 or --case2-matrix). Exiting')
sys.exit(0)
if prob == None:
print('WARNING: Missing confidence level (-n or --confidence-level). Using by default 0.95')
prob = 0.95
elif prob > 1:
print('FATAL ERROR: confidence level greater than 1, please especified it again (as an example 0.95)')
sys.exit(0)
### to count number of TIPs by cases
individuals = countPerChrs(matrixPathCase1, matrixPathCase2, outputDir, 0)
print(str(individuals)+" individuals processed")
associationTest(outputDir+"/TIPscount_cases.csv", individuals, float(prob))
else:
print("Util "+util+" did not found")
print("Util must be: finalMatrix or histograms or peaks or association")