-
Notifications
You must be signed in to change notification settings - Fork 1
/
catapep.py
executable file
·469 lines (380 loc) · 15.6 KB
/
catapep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
import pdb
import sys
import argparse
import stats
import aljamia
import shutil
import os
import sqlite3
from time import strftime
# *-*-* add further libraries used,
# *-*-* do not forget to include them to klibrate.py when creating the executables
import pprint
pp = pprint.PrettyPrinter(indent=4) # important for looking at lists while pdb-ing
#######################################################
def easterEgg():
print u"""
Diéronles a los dos a probar del vino de una cuba, pidiéndoles su parecer del
estado, cualidad, bondad o malicia del vino. El uno lo probó con la punta de la
lengua, el otro no hizo más de llegarlo a las narices. El primero dijo que
aquel vino sabía a hierro, el segundo dijo que más sabía a cordobán. El dueño
dijo que la cuba estaba limpia, y que el tal vino no tenía adobo alguno por
donde hubiese tomado sabor de hierro ni de cordobán. Con todo eso, los dos
famosos mojones se afirmaron en lo que habían dicho. Anduvo el tiempo, vendióse
el vino, y al limpiar de la cuba hallaron en ella una llave pequeña, pendiente
de una correa de cordobán.
Don Quixote, Part Two, Chapter XIII."""
#------------------------------------------------------
def getDataFromTXT(fileName,
iField, #MSFFileCol
jField = "", #RAWFileCol
kField = "", #scanNumberCol
lField = "", #chargeCol
mField = "", #pepSequenceCol
nField = "", #XCorrCol
initialRow = 1,
filterString = "",
removeDuplicates = True,
removeCommas = True):
# adapted from Aljamia v1.04
result = []
thisHeader = ""
reader = open(fileName, "r")
currentRowNumber = 1 # one based!
for myRow in reader:
if currentRowNumber >= initialRow:
thisRow = myRow.strip().split("\t")
if removeCommas:
for i in xrange(len(thisRow)):
if thisRow[i].endswith('"') and thisRow[i].startswith('"'):
thisRow[i] = thisRow[i][1:-1]
if currentRowNumber == initialRow:
# remove brackets if present
for i in xrange(len(thisRow)):
if thisRow[i].startswith('[') and thisRow[i].endswith(']'):
thisRow[i] = thisRow[i][1:-1]
thisHeader = thisRow
else:
# check for fields
filterOk = aljamia.checkFilter(filterString, isXML = False, currentRow = thisRow, headers = thisHeader)
if filterOk:
dataRow = []
if len(iField) > 0:
iValue = aljamia.replaceValuesTXT(thisRow, thisHeader, iField)
dataRow.append(iValue)
if len(jField) > 0:
jValue = aljamia.replaceValuesTXT(thisRow, thisHeader, jField)
dataRow.append(jValue)
if len(kField) > 0:
kValue = aljamia.replaceValuesTXT(thisRow, thisHeader, kField)
dataRow.append(kValue)
if len(lField) > 0:
lValue = aljamia.replaceValuesTXT(thisRow, thisHeader, lField)
dataRow.append(lValue)
if len(mField) > 0:
mValue = aljamia.replaceValuesTXT(thisRow, thisHeader, mField)
dataRow.append(mValue)
if len(nField) > 0:
nValue = aljamia.replaceValuesTXT(thisRow, thisHeader, nField)
dataRow.append(nValue)
result.append(dataRow)
currentRowNumber += 1
if removeDuplicates:
result = stats.removeDuplicates(result)
return result
#------------------------------------------------------
def main(options, programName, programVersion):
## REGION: DEFAULT VALUES AND VARIABLE ACQUISITION
# *-*-* add any default values here, such as default names of files
# basic default info
inputFile = ""
MSFFileCol = ""
RAWFileCol = ""
scanNumberCol = ""
chargeCol = ""
pepSequenceCol = ""
XCorrCol = ""
initialRow = 1
verbose = False
QuiXMLResults = False # otherwise, QuiXML will be considered
changeOriginalMSFFile = False # False = copy MSF to MSF_zeroed, True = change original file
logFile = ""
analysisName = ""
analysisFolder = ""
defaultAnalysisName = programName.lower()
# default extensions
defaultTableExtension = ".xls"
defaultTextExtension = ".txt"
# default file names
defaultLogFile = "logFile"
leadingFile = "" # *-*-* change this by the data file or any important file defining the operation
# basic log file
logList = [[programName + " " + programVersion], ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]
# parsing arguments from commandline
# *-*-* add any arguments used
options.add_argument("-a", "--analysis", type = str, default = "", required = True, help = "Use a prefix for the output files.")
options.add_argument("-p", "--place", type = str, default = "", required = True, help = "To use a different common folder for the output files. If this is not provided, the the folder used will be the same as the FASTA file folder.")
options.add_argument("-L", "--logfile", type = str, default = "", required = False, help = "To use a non-default name for the log file.")
# -d[nombre archivo donde están los xcorr = 0]
# -M[columna con nombre del msf]
# -r[rawfilecol]
# -s[scannumbercol]
# -q[chargecol]
# -P[pepsequencecol]
# -x[xcorrcol]
# -R[initialrow]
options.add_argument("-d", "--inputfile", type = str, default = "", required = True, help = "Name of the text file containing the list of PSMs to keep in the MSF.")
options.add_argument("-M", "--msffile", type = str, default = "", required = True, help = "Name of the MSF file having the PSMs to modify.")
options.add_argument("-r", "--rawfilecol", type = str, default = "", required = False, help = "Header of the column contaning the name of the RAW files. Default is \"RAWFileName\".")
options.add_argument("-s", "--scannumbercol", type = str, default = "", required = False, help = "Header of the column containing the scan numbers. Default is \"FirstScan\".")
options.add_argument("-q", "--chargecol", type = str, default = "", required = False, help = "Header of the column containing the charge. Default is \"Charge\".")
options.add_argument("-S", "--pepsequencecol", type = str, default = "", required = False, help = "Header of the column containing the identified peptide sequences. Default is \"Sequence\"")
options.add_argument("-x", "--xcorrcol", type = str, default = "", required = False, help = "Header of the column containing the XCorr. Default is \"XC1D\".")
options.add_argument("-R", "--initialrow", type = str, default = "1", required = False, help = "The position of the row containing the headers. Default is 1.")
options.add_argument("-v", "--verbose", action='store_true', help = "Show extra info while operating.")
options.add_argument("-Q", "--quixml", action='store_true', help = "Use column headers for QuiXML results tab separated table file (otherwise, pRatio results file headers will be used by default).")
options.add_argument("-O", "--changeoriginalmsf", action='store_true', help = "Do not copy the MSF file to be modified, just remove bad PSMs in the original file.")
# *-*-* add easter egg if wanted
arguments = options.parse_args()
# copying parsed arguments
# copy any arguments used
verbose = arguments.verbose
QuiXMLResults = arguments.quixml
changeOriginalMSFFile = arguments.changeoriginalmsf
if QuiXMLResults:
RAWFileCol = "RAWFileName"
scanNumberCol = "FirstScan"
chargeCol = "Charge"
pepSequenceCol = "Sequence"
XCorrCol = "XC1D"
initialRow = 24
else: # pRatio results are default
RAWFileCol = "RAWFile"
scanNumberCol = "FirstScan"
chargeCol = "Charge"
pepSequenceCol = "Sequence"
XCorrCol = "Xcorr1Search"
initialRow = 1
if len(arguments.analysis) > 0: analysisName = arguments.analysis
if len(arguments.place) > 0: analysisFolder = arguments.place
if len(arguments.logfile) > 0: logfile = arguments.logFile
if len(arguments.inputfile) > 0: inputFile = arguments.inputfile
if len(arguments.msffile) > 0: MSFFile = arguments.msffile
if len(arguments.rawfilecol) > 0: RAWFileCol = arguments.rawfilecol
if len(arguments.scannumbercol) > 0: scanNumberCol = arguments.scannumbercol
if len(arguments.chargecol) > 0: chargeCol = arguments.chargecol
if len(arguments.pepsequencecol) > 0: pepSequenceCol = arguments.pepsequencecol
if len(arguments.xcorrcol) > 0: XCorrCol = arguments.xcorrcol
if len(arguments.initialrow) > 0: initialRow = int(arguments.initialrow)
RAWFileCol = "[" + RAWFileCol + "]"
scanNumberCol = "[" + scanNumberCol + "]"
chargeCol = "[" + chargeCol + "]"
pepSequenceCol = "[" + pepSequenceCol + "]"
XCorrCol = "[" + XCorrCol + "]"
## END REGION: DEFAULT VALUES AND VARIABLE ACQUISITION
## **********************************************************
## REGION: FILE NAMES SETUP
# replace leadingfile by the real leading file
if len(analysisName) == 0:
if len(leadingFile) > 0:
analysisName = os.path.splitext(os.path.basename(leadingFile))[0]
else:
analysisName = defaultAnalysisName
if len(os.path.dirname(analysisName)) > 0:
analysisNameFirstPart = os.path.dirname(analysisName)
analysisName = os.path.basename(analysisName)
if len(analysisFolder) == 0:
analysisFolder = analysisNameFirstPart
# input
# *-*-* add input file setup
if len(os.path.dirname(inputFile)) == 0:
inputFile = os.path.join(analysisFolder, inputFile)
if len(os.path.dirname(MSFFile)) == 0:
MSFFile = os.path.join(analysisFolder, MSFFile)
if os.path.splitext(MSFFile)[1] != ".msf":
print
print "Warning: your MSF file does not seem to be an MSF file."
if changeOriginalMSFFile:
newMSFFile = MSFFile
else:
newMSFFile = os.path.splitext(MSFFile)[0] + "_zeroed" + os.path.splitext(MSFFile)[1]
# output
# *-*-* add output file setup
if len(logFile) == 0:
logFile = os.path.join(analysisFolder, analysisName + "_" + defaultLogFile + defaultTextExtension)
if len(os.path.dirname(logFile)) == 0 and len(os.path.basename(logFile)) > 0:
logFile = os.path.join(analysisFolder, logFile)
# *-*-* add to the log the input and output filenames
logList.append([""])
logList.append(["Output log file: " + logFile])
logList.append([""])
## END REGION: FILE NAMES SETUP
## **********************************************************
## REGION: PROGRAM BASIC STRUCTURE
# *-*-* add basic structure
print
if not changeOriginalMSFFile:
message = "Creating new file: %s" % newMSFFile
logList.append([message])
print message
shutil.copyfile(MSFFile, newMSFFile)
message = "Loading PSMs to keep from: %s" % inputFile
logList.append([message])
print message
myList = getDataFromTXT(inputFile,
iField = RAWFileCol,
jField = scanNumberCol,
kField = chargeCol,
lField = pepSequenceCol,
initialRow = initialRow,
filterString = XCorrCol + ">0",
removeDuplicates = True,
removeCommas = True)
if len(myList) == 0:
message = "\nWarning!! No PSMs found in the text file provided,\nDeleting all XCorrs.\n"
else:
message = "PSMs found in text file: %i" % len(myList)
logList.append([message])
print message
compareListIdentified = []
for element in myList:
myRAWFilePath = element[0]
myRAWFile = os.path.basename(myRAWFilePath)
myScanNumber = element[1]
myCharge = element[2]
myPepSequence = element[3] # currently unused
compareListIdentified.append([myRAWFile, int(myScanNumber), int(myCharge)])
connexion = sqlite3.connect(newMSFFile)
c = connexion.cursor()
existingScanQuery = """select
p.peptideid,
fi.filename,
sh.firstscan,
sh.lastscan,
sh.charge,
p.sequence,
ps.scorevalue
from
peptides p,
peptideScores ps,
spectrumHeaders sh,
massPeaks mp,
fileInfos fi,
processingNodeScores scoreNames
where
p.peptideid = ps.peptideid and
sh.spectrumid = p.spectrumid and
(fi.fileid = mp.fileid or mp.fileid = -1) and
mp.masspeakid = sh.masspeakid and
scoreNames.scoreid = ps.scoreid and
scoreNames.ScoreName = 'Xcorr'
order by fi.filename desc,
sh.firstscan asc,
sh.lastscan asc,
sh.charge asc,
ps.scorevalue desc;"""
message = "Searching all PSMs in database..."
logList.append([message])
print message
wholeMSFList = []
for psm in c.execute(existingScanQuery):
wholeMSFList.append(psm)
checked = 0
changed = 0
alreadyZero = 0
message = "Zeroing...!"
logList.append([message])
message = "Original MSF contains %i PSMs." % len(wholeMSFList)
logList.append([message])
logList.append([""])
logList.append(["Zeroed\tRAWFileName\tScanNumber\tCharge\tSequence (without PTMs)\tOriginal XCorr"])
print message
for psm in wholeMSFList:
checked += 1
pepId = int(psm[0])
rawFilePath = str(psm[1])
rawFile = os.path.basename(rawFilePath)
firstScan = int(psm[2])
lastScan = int(psm[3])
charge = int(psm[4])
sequence = str(psm[5])
XCorr = float(psm[6])
# now check whether this is in the given list. If not, then make XCorr = 0
# compareListIdentified.append([myRAWFile, myScanNumber, myCharge, myPepSequence])
compareListInMSF = [rawFile, firstScan, charge]
msfScanPresentInList = (compareListInMSF in compareListIdentified)
if XCorr == 0:
alreadyZero += 1
if not msfScanPresentInList and XCorr != 0:
changed += 1
zeroSettingQuery = """update
peptideScores
set
scoreValue = 0
where
peptideID in (
select
p.peptideID
from
fileinfos fi,
massPeaks mp,
spectrumHeaders sh,
peptides p
where
(fi.fileid = mp.fileid or mp.fileid = -1) and
mp.masspeakid = sh.masspeakid and
sh.spectrumid = p.spectrumid and
sh.firstScan = %i and
sh.charge = %i and
fi.fileName like "%%%s"
) and
scoreID = (
select
scoreId
from
processingNodeScores
where
scoreName = "XCorr"
);""" % (firstScan, charge, rawFile)
# pdb.set_trace()
c.execute(zeroSettingQuery)
message = """%i\t%s\t%i\t%i\t%s\t%f""" % (checked, rawFile, firstScan, charge, sequence, XCorr)
logList.append([message])
if verbose:
message = """Zeroed (%i/%i):\nraw = "%s"\nscan = %i\ncharge = %i\nsequence = %s\nXCorr = %f\n""" % (checked, len(wholeMSFList), rawFile, firstScan, charge, sequence, XCorr)
print message
print
message = "PTMs found: %i,\nPTMs zeroed: %i,\nDifference: %i,\nAlready zero: %i" % (checked, changed, checked - changed, alreadyZero)
print message
logList.append([""])
logList.append([message])
print
print "Saving changes..."
connexion.commit()
print
print "Closing connexion..."
connexion.close()
## END REGION: PROGRAM BASIC STRUCTURE
## **********************************************************
## REGION: SAVING FILES
# *-*-* add any files to be saved here
try:
# operations here
logList.append(["Most probably, everything went fine."])
except getopt.GetoptError:
logList.append(["Error."])
stats.saveFile(logFile, logList, "LOG FILE")
sys.exit(2)
stats.saveFile(logFile, logList, "LOG FILE")
print "Done!"
## END REGION: SAVING FILES
#######################################################
if __name__ == "__main__":
# *-*-* change programStarter by the real name of the program
programName = "cataPep"
# *-*-* change programVersion by the program version (yeah, this is tricky)
programVersion = "v1.03"
# *-*-* change the programUse for another to be included in the help
# make sure it makes sense within the sentence "used to ..."
programUse = "make zero the XCorrs of all the PSMs in an MSF file, excluding those provided in a list."
main(argparse.ArgumentParser(description = u"%s %s is a program made in the Jesus Vazquez Cardiovascular Proteomics Lab at Centro Nacional de Investigaciones Cardiovasculares, used to %s." % (programName, programVersion, programUse)), programName, programVersion)