-
Notifications
You must be signed in to change notification settings - Fork 0
/
filtered_sam_to_intervals.py
executable file
·1061 lines (874 loc) · 32.9 KB
/
filtered_sam_to_intervals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
"""
Filter and convert a SAM file to a list of intervals/reads
References:
[1] The SAM Format Specification (samtools.github.io/hts-specs/SAMv1.pdf)
[2] Using eval() safely in python (lybniz2.sourceforge.net/safeeval.html)
"""
from sys import argv,stdin,stderr,exit
from math import *
from re import compile
from sam_reader import samFieldToColumn,SAM_MIN_COLUMNS,SAM_QNAME_COLUMN
try: from hashlib import md5 as md5_new
except ImportError: from md5 import new as md5_new
def usage(s=None):
message = """
usage: cat sam_file | filtered_sam_to_intervals [options]
--namesorted the sam file has been sorted by read names
--mergemates[=[<min>..<max>] merge the intervals that have the same name,
and discard any singletons, multi-chromosomal, or
that are outside the expected insert length
--requiremates[=[<min>..<max>] same as --mergemates, in that mated pairs are
required, but intervals are still output separately
--require:<criterion> (cumulative) ignore any input lines that don't
satisfy the specified criterion; this is a
statement that evaluates to true if the line should
be kept, or false if the line should be discarded
--prohibit:<criterion> (cumulative) ignore any input lines that satisfy
the specified criterion
--subset:qname=<K>/<N> conceptually split the input file into <N> groups,
and only process the <K>th group; <K> ranges from 1
to <N>; the split utilizes a hash code of the query
name, so all records with the same name are in the
same subset (this means pairs will remain paired)
--chromosome[s]=<names> (cumulative) only output intervals on the specified
chromosomes; <names> is a comma-separated list
(default is to report intervals on all chromosomes)
--origin=one output intervals as origin-one, closed
--origin=zero output intervals as origin-zero, half-open
(this is the default)
--nonames don't output <read_name> as 4th column
--samrecords output entire sam record starting at our 4th column
--justsamrecords only output sam records
--report:<variable> (cumulative) report additional fields
--head=<number> limit the number of input records; note that
records are counted *before* filtering is performed
--progress=<number> periodically report how many records we've read
--progress=output:<number> periodically report how many records we've written
By default, the output file is a list of <chrom> <start> <end> <read_name>,
but if --nonames is used, it is just a list of <chrom> <start> <end>
Criteria for requirements and prohibitions are something like python
expressions. Some examples are
(CIGAR == *)
(RNEXT == =)
(FLAGS & 0x9CF == 0x049) or (FLAGS & 0x9CF == 0x089)
(0x004 in FLAGS)
(UNCLIP > RLEN*0.10)
Note that this program has to parse the expressions and convert them to
actual python statements. This saves the user from having to deal with the
shell handling of quote marks-- imagine the effort to get the shell to pass
the program quote marks in (CIGAR == "*") for example. Further, the parsing
allows us to figure out which variables are actually needed in the expression
context, which allows us to skip the computation of values that aren't
needed. The caveat is that this parsing is simplistic. General expressions
aren't supported. It is hoped that those that are supported are useful.
Names in expresssions come in three flavors. First, they can be field names
from the SAM spec:
QNAME FLAG RNAME POS MAPQ CIGAR
RNEXT PNEXT TLEN SEQ QUAL
POS1 POS for first read in a pair (when --mergemates or
POS2 POS for second read in a pair .. --requiremates is used)
Second, they can be aliases for tags:
SCORE alias for AS tag
SUBOPT alias for XS tag
Third, they can be names we compute from the SAM record:
RLEN length of the the read
BESTBY difference of alignment score minus suboptimal score
MAPCLIP number of bases remaining in the alignment after any clipping
UNCLIP number of bases clipped from both ends of alignment (not mapped)
LUNCLIP number of bases clipped from left end of read (not mapped)
RUNCLIP number of bases clipped from right end of read (not mapped)
MINCLIP minimum number of bases clipped from either end of read (not mapped)
CLIPBRK position of breakpoint suggested by clipped read
MORIENT orientation of mate within pair (1F, 1R, 2F, or 2R)
PORIENT orientation of pair (H2H or T2T)
FLAGS (same as FLAG)
There are also a few canned criteria:
broken mates stands for (FLAGS&0x9CF == 0x049) or (FLAGS&0x9CF == 0x089)"""
if (s == None): exit (message)
else: exit ("%s\n%s" % (s,message))
def main():
global subsetN,subsetK
global knownCriteria,computedVariables,tagToVariable,knownVariables,pairVariables
global variablesNeeded,tagsNeeded,requirements,prohibitions
global headLimit,reportProgress,progressId,writtenProgress
global outputWhat,mergeEm,mergeDistanceMin,mergeDistanceMax
global origin
global debug
knownCriteria = { \
"head to head" : (None,", use (PORIENT==H2H)"),
"head-to-head" : (None,", use (PORIENT==H2H)"),
"tail to tail" : (None,", use (PORIENT==T2T)"),
"tail-to-tail" : (None,", use (PORIENT==T2T)"),
"broken mates" : "(FLAGS & 0x9CF == 0x049) or (FLAGS & 0x9CF == 0x089)",
"broken mates with unmapped" : "(FLAGS & 0x9CF == 0x049) or (FLAGS & 0x9CF == 0x089) or (FLAGS & 0x9CF == 0x085)"
}
tagToVariable = { \
"AS" : "SCORE", # primary alignment score
"XS" : "SUBOPT" # next best alignment score
}
computedVariables = { \
"RLEN" : (read_length, []),
"MAPCLIP" : (after_clip_length, []),
"UNCLIP" : (clip_length, []),
"LUNCLIP" : (left_clip_length, []),
"RUNCLIP" : (right_clip_length, []),
"MINCLIP" : (min_clip_length, []),
"CLIPBRK" : (clip_breakpoint, []),
"BESTBY" : (score_best_by, ["SCORE","SUBOPT"]),
"MORIENT" : (mate_orientation, []),
"PORIENT" : (pair_orientation, [])
}
knownVariables = ["QNAME", "FLAG" , "RNAME", "POS" , "MAPQ" , "CIGAR",
"RNEXT", "PNEXT", "TLEN" , "SEQ" , "QUAL" , "FLAGS"]
pairVariables = ["POS1","POS2"]
for tag in tagToVariable:
variable = tagToVariable[tag]
knownVariables += [variable]
for variable in computedVariables:
knownVariables += [variable]
# parse the command line
isNameSorted = False
mergeEm = False
mergeDistanceMin = None
mergeDistanceMax = None
mergeButSeparate = False
requirements = []
prohibitions = []
subsetN = None
subsetK = None
chromsOfInterest = None
origin = "zero"
outputWhat = ["interval","name"]
headLimit = None
reportProgress = None
writtenProgress = None
progressId = None
debug = []
for arg in argv[1:]:
if ("=" in arg):
argVal = arg.split("=",1)[1]
if (arg == "--namesorted"):
isNameSorted = True
elif (arg == "--mergemates") \
or (arg == "--requiremates"):
mergeEm = True
mergeButSeparate = (arg == "--requiremates")
mergeDistanceMin = None
mergeDistanceMax = None
elif (arg.startswith("--mergemates=")) \
or (arg.startswith("--requiremates=")):
mergeEm = True
mergeButSeparate = (arg.startswith("--requiremates="))
if (".." not in argVal):
mergeDistanceMin = None
mergeDistanceMax = int_with_unit(argVal)
else:
(argMin,argMax) = argVal.split("..",1)
if (argMax == ""):
mergeDistanceMin = int_with_unit(argMin)
mergeDistanceMax = None
elif (argMin == ""):
mergeDistanceMin = None
mergeDistanceMax = int_with_unit(argMax)
else:
mergeDistanceMin = int_with_unit(argMin)
mergeDistanceMax = int_with_unit(argMax)
elif (arg.startswith("--require:")):
argVal = arg.split(":",1)[1]
requirements += [argVal.strip()]
elif (arg.startswith("--prohibit:")):
argVal = arg.split(":",1)[1]
prohibitions += [argVal.strip()]
elif (arg.startswith("--subset:qname=")) or (arg.startswith("--subset:name=")):
assert ("/" in argVal)
(subsetK,subsetN) = argVal.split("/",1)
subsetN = int(subsetN)
subsetK = int(subsetK)
assert (0 < subsetK <= subsetN)
elif (arg.startswith("--chromosome=")) or (arg.startswith("--chromosomes=")) \
or (arg.startswith("--chrom=")) or (arg.startswith("--chroms=")):
if (chromsOfInterest == None): chromsOfInterest = []
chromsOfInterest += argVal.split(",")
elif (arg.startswith("--origin=")):
origin = argVal
if (origin == "0"): origin = "zero"
if (origin == "1"): origin = "one"
assert (origin in ["zero","one"]), "can't understand %s" % arg
elif (arg == "--nonames"):
outputWhat = [x for x in outputWhat if (x != "name")]
elif (arg == "--samrecords"):
outputWhat = [x for x in outputWhat if (x not in ["name","sam record"])]
outputWhat += ["sam record"]
elif (arg == "--justsamrecords") or (arg == "--justsam"):
outputWhat = ["sam record"]
elif (arg.startswith("--report:")):
argVal = arg.split(":",1)[1]
for variable in argVal.split(","):
outputWhat += [variable.strip()]
elif (arg.startswith("--head=")):
headLimit = int_with_unit(argVal)
elif (arg.startswith("--progress=output:")) or (arg.startswith("--progress=written:")):
writtenProgress = int_with_unit(argVal.split(":",1)[1])
elif (arg.startswith("--progress=")):
if (":" in argVal):
(progressId,reportProgress) = argVal.split(":",1)
reportProgress = int_with_unit(reportProgress)
else:
reportProgress = int_with_unit(argVal)
elif (arg == "--debug"):
debug += ["debug"]
elif (arg.startswith("--debug=")):
debug += argVal.split(",")
elif (arg.startswith("--")):
usage("unrecognized option: %s" % arg)
else:
usage("unrecognized option: %s" % arg)
if (mergeEm) and (not mergeButSeparate):
extras = [x for x in outputWhat if (x not in ["interval","name","sam record"])]
if (extras != []):
usage("--report with --mergemates is not implemented yet")
# preprocess any requirements, changing them into python statements
variablesNeeded = set()
incoming = requirements
requirements = {}
for criterion in incoming:
alias = criterion
if (criterion in knownCriteria):
alias = knownCriteria[criterion]
if (alias == None):
usage("requirement no longer supported: \"%s\"" % criterion)
elif (type(alias) != str):
assert (alias[0] == None)
usage("requirement no longer supported: \"%s\"%s" % (criterion,alias[1]))
try:
(expression,variables) = criterion_to_python(alias)
requirements[criterion] = (expression,variables)
for name in variables: variablesNeeded.add(name)
except ValueError:
usage("uninterpretable requirement: \"%s\"" % criterion)
incoming = prohibitions
prohibitions = {}
for criterion in incoming:
alias = criterion
if (criterion in knownCriteria):
alias = knownCriteria[criterion]
if (alias == None):
usage("prohibition no longer supported: \"%s\"" % criterion)
elif (type(alias) != str):
assert (alias[0] == None)
usage("prohibition no longer supported: \"%s\"%s" % (criterion,alias[1]))
try:
(expression,variables) = criterion_to_python(alias)
prohibitions[criterion] = (expression,variables)
for name in variables: variablesNeeded.add(name)
except ValueError:
usage("uninterpretable prohibition: \"%s\"" % criterion)
for variable in outputWhat:
if (variable in ["interval","name","sam record"]): continue
variablesNeeded.add(variable)
for variable in computedVariables:
if (variable not in variablesNeeded): continue
(_,dependencies) = computedVariables[variable]
for name in dependencies:
variablesNeeded.add(name)
variablesNeeded = list(variablesNeeded)
variablesNeeded.sort()
if (not mergeEm) and (not mergeButSeparate):
pairVariablesNeeded = [variable for variable in variablesNeeded
if (variable in pairVariables)]
assert (pairVariablesNeeded == []), \
"pair variables can't be used without --mergemates or --requiremates\n" \
+ " pair variables: %s" \
% ",".join(pairVariablesNeeded)
else:
pairVariablesNeeded = [variable for variable in variablesNeeded
if (variable in pairVariables)]
assert (pairVariablesNeeded == []), \
"sorry, pair variables aren't implemented yet\n" \
+ " pair variables: %s" \
% ",".join(pairVariablesNeeded)
tagsNeeded = []
for tag in tagToVariable:
variable = tagToVariable[tag]
if (variable in variablesNeeded):
tagsNeeded += [tag]
tagsNeeded.sort()
if ("evaluation" in debug) and (requirements != {}):
print >>stderr, "=== requirements ==="
for criterionStr in requirements:
(criterion,vars) = requirements[criterionStr]
if ([var for var in vars if (var in pairVariables)] == []):
print >>stderr, " \"%s\" evaluated as \"%s\"" % (criterionStr,criterion)
else:
print >>stderr, " \"%s\" pair-evaluated as \"%s\"" % (criterionStr,criterion)
if ("evaluation" in debug) and (prohibitions != {}):
print >>stderr, "=== prohibitions ==="
for criterionStr in prohibitions:
(criterion,vars) = prohibitions[criterionStr]
if ([var for var in vars if (var in pairVariables)] == []):
print >>stderr, " \"%s\" evaluated as \"%s\"" % (criterionStr,criterion)
else:
print >>stderr, " \"%s\" pair-evaluated as \"%s\"" % (criterionStr,criterion)
if ("context" in debug):
print >>stderr, "=== variables needed ==="
for name in variablesNeeded:
print >>stderr, " %s" % name
if ("context" in debug) and (tagsNeeded != []):
print >>stderr, "=== tags needed ==="
for tag in tagsNeeded:
print >>stderr, " %s" % tag
# process the SAM file
readIntervals = None
readToIntervals = None
if (mergeEm):
if (isNameSorted): prevQName = None
else: readToIntervals = {}
for sam in read_sam_simple(stdin):
(lineNumber,samRecord,context) = sam
if ("input" in debug):
print >>stderr, lineNumber,context["QNAME"],context["RNAME"]
rName = context["RNAME"]
if (chromsOfInterest != None) and (rName not in chromsOfInterest):
continue
cigar = context["CIGAR"]
if (cigar == "*"):
rName = rPos = start = end = "*"
else:
extent = cigar_to_extent(cigar,lineNumber=lineNumber)
if (extent == None): continue
rPos = context["POS"]
(left,right) = extent
start = rPos - left
end = rPos + right
if (start < 0): start = 0
assert (end >= start), "start > end (%d > %d) at line %d" \
% (start,end,lineNumber)
if (origin == "one"): start += 1
qName = context["QNAME"]
if (not mergeEm):
write_interval(rName,start,end,qName,context,samRecord,rPos,cigar)
continue
if (isNameSorted):
if (prevQName == None):
readIntervals = []
prevQName = qName
elif (qName != prevQName):
write_merged_interval(prevQName,readIntervals,reportSeparate=mergeButSeparate)
readIntervals = []
prevQName = qName
readIntervals += [(rName,start,end,context,samRecord)]
continue
if (qName not in readToIntervals):
readToIntervals[qName] = [(rName,start,end,context,samRecord)]
else:
readToIntervals[qName] += [(rName,start,end,context,samRecord)]
# output merged intervals
if (readIntervals != None):
write_merged_interval(prevQName,readIntervals,reportSeparate=mergeButSeparate)
elif (readToIntervals != None):
for qName in readToIntervals:
write_merged_interval(qName,readToIntervals[qName],reportSeparate=mergeButSeparate)
def write_merged_interval(qName,intervals,reportSeparate=False):
if (len(intervals) == 1): return
if (not reportSeparate):
rNames = {}
for (rName,_,_,_,_) in intervals:
rNames[rName] = True
if (len(list(rNames)) != 1): return
intervals.sort()
(rName1,start1,end1,context1,samRecord1) = intervals[0]
end = max([e for (_,_,e,_,_) in intervals])
if (reportSeparate):
intervals = [(e,s,rName,context,samRecord) for (rName,s,e,context,samRecord) in intervals[1:]]
intervals.sort()
(end2,start2,rName2,context2,samRecord2) = intervals[-1]
if (mergeDistanceMin != None) and (end-start1 < mergeDistanceMin):
return
if (mergeDistanceMax != None) and (end-start1 > mergeDistanceMax):
return
if (reportSeparate):
write_interval(rName1,start1,end1,qName,context1,samRecord1)
write_interval(rName2,start2,end2,qName,context2,samRecord2)
else:
write_interval(rName,start1,end,qName)
numberWritten = 0
def write_interval(rName,start,end,qName,
context=None,samRecord=None,rPos=None,cigar=None):
global numberWritten
line = []
if ("interval" in outputWhat):
line += ["%s\t%s\t%s" % (rName,start,end)]
if ("name" in outputWhat):
if ("cigar" not in debug):
line += [qName]
else:
if (rPos == None): rPos = -1
if (cigar == None): cigar = "(none)"
line += ["%s\t%d\t%s" % (qName,rPos,cigar)]
for variable in outputWhat:
if (variable in ["interval","name","sam record"]): continue
if (context == None) or (variable not in context):
line += ["(%s)" % variable]
elif (variable == "FLAGS"):
line += ["0x%03X" % context[variable]]
else:
line += [str(context[variable])]
if ("sam record" in outputWhat):
if (samRecord == None): samRecord = "(sam)"
line += [samRecord]
print "\t".join(line)
numberWritten += 1
if (writtenProgress != None) and (numberWritten % writtenProgress == 0):
progressCount = commatize(numberWritten)
if (outputWhat == ["sam record"]): writingWhat = "sam records"
else: writingWhat = "intervals"
if (progressId == None):
print >>stderr, "progress: %s %s written" % (progressCount,writingWhat)
else:
print >>stderr, "progress: %s / %s %s written" % (progressId,progressCount,writingWhat)
def read_sam_simple(f):
lineNumber = recordNumber = 0
for line in stdin:
lineNumber += 1
line = line.strip()
if (line.startswith("@")):
if (outputWhat == ["sam record"]): # (nothing but sam is being output)
if (line.startswith("@SQ")): print line
continue
recordNumber += 1
if (reportProgress != None) and (recordNumber % reportProgress == 0):
progressCount = commatize(recordNumber)
if (progressId == None):
print >>stderr, "progress: %s sam records read" % progressCount
else:
print >>stderr, "progress: %s / %s sam records read" % (progressId,progressCount)
if (headLimit != None) and (recordNumber > headLimit):
print >>stderr, "limit of %d sam records reached" % headLimit
break
fields = line.split()
numFields = len(fields)
assert (numFields >= SAM_MIN_COLUMNS), \
"not enough columns at line %d (%d, expected %d)" \
% (lineNumber,numFields,SAM_MIN_COLUMNS)
if ("evaluation" in debug):
print >>stderr
print >>stderr, "line %d: \"%s\"" % (lineNumber," ".join(fields[:4]))
# if we are only to process a named-based subset, filter out any reads
# not in that subset
if (subsetN != None):
hashVal = md5_new()
hashVal.update(fields[SAM_QNAME_COLUMN])
hashK = 1 + (int(hashVal.hexdigest()[:25],16) % subsetN)
if (hashK != subsetK): continue
# create the context in which we will evaluate the criteria
context = dict(safeDict)
context["LINENUMBER"] = lineNumber
for name in samFieldToColumn:
col = samFieldToColumn[name]
context[name] = fields[col]
try:
context["FLAG" ] = int_or_string(context["FLAG" ])
context["POS" ] = int_or_string(context["POS" ])
context["MAPQ" ] = int_or_string(context["MAPQ" ])
context["PNEXT"] = int_or_string(context["PNEXT"])
context["TLEN" ] = int_or_string(context["TLEN" ])
except ValueError:
assert (False), "bad SAM record at line %d\n%s" % (lineNumber,line)
context["FLAGS"] = context["FLAG"]
if (type(context["POS"]) != str): context["POS"] -= 1
for field in fields[SAM_MIN_COLUMNS:]:
(tag,typeCode,val) = field.split(":",2)
if (tag not in tagsNeeded):
continue
variable = tagToVariable[tag]
if (typeCode == "i"): context[variable] = int(val)
elif (typeCode == "f"): context[variable] = float(val)
else: context[variable] = val
for tag in tagsNeeded:
variable = tagToVariable[tag]
if (variable not in context): context[variable] = None
for variable in computedVariables:
if (variable not in variablesNeeded): continue
(func,_) = computedVariables[variable]
context[variable] = func(context)
if ("context" in debug) or ("flags" in debug):
values = [(name,context[name]) for name in context if (name not in safeDict)]
if ("flags" in debug):
values = [(name,context[name]) for name in context if (name == "FLAGS")]
values.sort()
print >>stderr, "=== context for line %d ===" % lineNumber
nameWidth = max([len(name) for (name,_) in values])
for (name,val) in values:
if (name in ["SEQ","QUAL"]) and (len(val) > 20):
val = "\"%s...\"" % val[:20]
elif (name in ["FLAG","FLAGS"]):
val = flags_to_string(val)
elif (type(val) == str): val = "\"%s\"" % val
print >>stderr, " %-*s = %s" % (nameWidth,name,val)
# filter, by evaluating the requirements and prohibitions
reject = False
for criterionStr in requirements:
(criterion,_) = requirements[criterionStr]
if ("evaluation" in debug):
print >>stderr, "evaluating requirement \"%s\"" % criterion
try:
val = eval(criterion,{"__builtins__":None},context)
except NameError:
print >>stderr, "failed to evaluate requirement \"%s\"" % criterion
raise
#if ("evaluation" in debug):
# print >>stderr, " (evaluates to %s)" % val
if (val == False):
reject = True
if ("evaluation" in debug): print >>stderr, " (rejected)"
break
if (val != True):
assert (False), "requirement \"%s\" evaluates to \"%s\" on SAM record at line %d\n%s\nevaluated as \"%s\"" \
% (criterionStr,val,lineNumber,line,criterion)
if (reject): continue
for criterionStr in prohibitions:
(criterion,_) = prohibitions[criterionStr]
if ("evaluation" in debug):
print >>stderr, "evaluating prohibition \"%s\"" % criterion
try:
val = eval(criterion,{"__builtins__":None},context)
except NameError:
print >>stderr, "failed to evaluate prohibition \"%s\"" % criterion
raise
#if ("evaluation" in debug):
# print >>stderr, " (evaluates to %s)" % val
if (val == True):
reject = True
if ("evaluation" in debug): print >>stderr, " (rejected)"
break
if (val != False):
assert (False), "prohibition \"%s\" evaluates to \"%s\" on SAM record at line %d\n%s\n(evaluated as \"%s\")" \
% (criterionStr,lineNumber,line,criterion)
if (reject): continue
# if it made it through all that, keep it
yield (lineNumber,line,context)
# functions to support "special variables"
def read_length(context):
return len(context["SEQ"])
def after_clip_length(context):
cigar = context["CIGAR"]
cigarInfo = split_cigar(cigar)
unclip = len(context["SEQ"])
if (cigarInfo != None):
(rpt,op) = cigarInfo.operations[0]
if (op in ["S","H"]): unclip -= rpt
if (len(cigarInfo.operations) > 1):
(rpt,op) = cigarInfo.operations[-1]
if (op in ["S","H"]): unclip -= rpt
return unclip
def clip_length(context):
cigar = context["CIGAR"]
cigarInfo = split_cigar(cigar)
clip = 0
if (cigarInfo != None):
(rpt,op) = cigarInfo.operations[0]
if (op in ["S","H"]): clip += rpt
if (len(cigarInfo.operations) > 1):
(rpt,op) = cigarInfo.operations[-1]
if (op in ["S","H"]): clip += rpt
return clip
def left_clip_length(context):
cigar = context["CIGAR"]
cigarInfo = split_cigar(cigar)
if (cigarInfo != None):
(rpt,op) = cigarInfo.operations[0]
if (op in ["S","H"]): return rpt
return 0
def right_clip_length(context):
cigar = context["CIGAR"]
cigarInfo = split_cigar(cigar)
if (cigarInfo != None):
(rpt,op) = cigarInfo.operations[-1]
if (op in ["S","H"]): return rpt
return 0
def min_clip_length(context):
(lftClip,rgtClip) = cigar_to_clip_lengths(context["CIGAR"])
return min(lftClip,rgtClip)
def clip_breakpoint(context):
extent = cigar_to_extent(context["CIGAR"])
if (extent == None): return "(NO_CLIPBRK)"
(left,right) = extent
(lftClip,rgtClip) = cigar_to_clip_lengths(context["CIGAR"])
if (lftClip == 0) and (rgtClip == 0): return "(NO_CLIPBRK)"
rPos = context["POS"]
if (lftClip >= rgtClip):
start = rPos - left
if (start < 0): start = 0
if (origin == "one"): start += 1
return start
else:
end = rPos + right
return end
def score_best_by(context):
score = context["SCORE"]
subopt = context["SUBOPT"]
if (score == None): score = 0
if (subopt == None): subopt = 0
return score - subopt
def mate_orientation(context):
flags = context["FLAG"]
if (flags & 0xFF9 == 0x061): return "1F";
if (flags & 0xFF9 == 0x091): return "2R";
if (flags & 0xFF9 == 0x051): return "1R";
if (flags & 0xFF9 == 0x0A1): return "2F";
return "(MORIENT)"
def pair_orientation(context):
flags = context["FLAG"]
tLen = context["TLEN"]
if (tLen > 0):
if (flags & 0xFF9 in [0x061,0x0A1]): return "H2H";
if (flags & 0xFF9 in [0x091,0x051]): return "T2T";
elif (tLen < 0):
if (flags & 0xFF9 in [0x091,0x051]): return "H2H";
if (flags & 0xFF9 in [0x061,0x0A1]): return "T2T";
return "(PORIENT)"
# flags_to_string--
nybbleToBits = { 0:"0000", 5:"0101", 10:"1010", 15:"1111",
1:"0001", 6:"0110", 11:"1011",
2:"0010", 7:"0111", 12:"1100",
3:"0011", 8:"1000", 13:"1101",
4:"0100", 9:"1001", 14:"1110"}
def flags_to_string(val):
(v,nybbles) = (val,1)
while (v > 16):
(v,nybbles) = (v/16,nybbles+1)
nybbles = max(nybbles,3)
s = []
v = val
while (nybbles > 0):
s += [nybbleToBits[v % 16]]
v /= 16
nybbles -= 1
s += ["0x%0*X =" % (nybbles,val)]
s.reverse()
return " ".join(s)
# cigar string processing--
def cigar_to_extent(cigar,lineNumber=None):
cigarInfo = split_cigar(cigar)
if (cigarInfo == None): return None
left = 0
(rpt,op) = cigarInfo.operations[0]
if (op == "S"): left += rpt
right = left
for (rpt,op) in cigarInfo.operations:
if (op in ["M","X","=","D","N"]):
right += rpt
elif (op in ["I","S"]):
pass
else:
if (lineNumber == None):
assert (False), "unsupported \"%d%s\" in cigar %s" \
% (rpt,op,cigar)
else:
assert (False), "unsupported \"%d%s\" in cigar %s (line %d)" \
% (rpt,op,cigar,lineNumber)
return (0,right-left)
def cigar_to_clip_lengths(cigar):
cigarInfo = split_cigar(cigar)
lftClip = rgtClip = 0
if (cigarInfo != None):
(rpt,op) = cigarInfo.operations[0]
if (op in ["S","H"]): lftClip = rpt
if (len(cigarInfo.operations) > 1):
(rpt,op) = cigarInfo.operations[-1]
if (op in ["S","H"]): rgtClip = rpt
return (lftClip,rgtClip)
class CigarInfo: pass
def split_cigar(cigar):
if (cigar == "*"): return None
# split the cigar into a list of (count,operation)
operations = []
rpt = []
for ch in cigar:
if (ch.isdigit()):
rpt += [ch]
else:
assert (rpt != []), "bad cigar: \"%s\"" % cigar
operations += [(int("".join(rpt)),ch)]
rpt = []
assert (rpt == []), "bad cigar: \"%s\"" % cigar
# trim clipping operators from the ends
startClip = endClip = 0
if (operations != []):
(rpt,op) = operations[0]
if (op == "H"):
startClip = rpt
operations = operations[1:]
if (operations != []):
(rpt,op) = operations[-1]
if (op == "H"):
endClip = rpt
operations = operations[:-1]
splitCigar = CigarInfo()
splitCigar.operations = operations
splitCigar.startClip = startClip
splitCigar.endClip = endClip
return splitCigar
# evaluation context stuff--
# (see reference [2], lybniz2.sourceforge.net/safeeval.html)
def round_int(x): return int(round(x))
def floor_int(x): return int(floor(x))
def ceil_int(x): return int(ceil(x))
def log2(x): return log(x) / log(2.0)
safeList = ["e", "exp", "log", "log10", "pi", "pow", "sqrt"]
safeDict = dict([(k,locals().get(k,None)) for k in safeList])
safeDict["abs"] = abs
safeDict["int"] = int
safeDict["float"] = float
safeDict["ceil"] = ceil_int
safeDict["floor"] = floor_int
safeDict["round"] = round_int
safeDict["log2"] = log2
safeDict["max"] = max
safeDict["min"] = min
# criterion to python expression conversion--
eqnVar = "[A-Z]+"
eqnInt = "[0-9]+"
eqnFloat = "[0-9]*\.[0-9]*"
eqnHex = "0x[0-9A-Fa-f]+"
eqnArith = "\*|/|\+|\-"
eqnOperator = "==|!=|>|<|>=|<=|in|not in"
eqnStrConst = "\*|=|None|1F|1R|2F|2R|H2H|T2T"
eqnObject = eqnVar + "|" + eqnInt + "|" + eqnHex + "|" + eqnFloat + "|" + eqnStrConst
eqnExpression = eqnVar + "(" + eqnArith + ")" + "(" + eqnInt + "|" + eqnFloat + ")"
eqnFlags = "FLAGS *& *0x[0-9A-Fa-f]+"
eqnRe = compile("\( *"
+ "(?P<left>" + eqnObject + "|" + eqnExpression + "|" + eqnFlags + ")"
+ " *"
+ "(?P<operator>" + eqnOperator + ")"
+ " *"
+ "(?P<right>" + eqnObject + "|" + eqnExpression + ")"
+ " *\)")
eqnExpressionRe = compile(
"(?P<left>" + eqnVar + ")"
+ " *"
+ "(?P<operator>" + eqnArith + ")"
+ " *"
+ "(?P<right>" + eqnVar + "|" + eqnInt + "|" + eqnFloat + ")")
eqnFlagsRe = compile(eqnFlags)
def criterion_to_python(s):
if ("expressions" in debug):
print >>stderr, "converting \"%s\" to python" % s
expression = []
variables = set()
while (True):
m = eqnRe.search(s)
if (m == None):
expression += [s]
break
text = m.group(0)
(prefix,suffix) = s.split(text,1)
left = m.group("left")
op = m.group("operator")
right = m.group("right")
if ("expressions" in debug):
print >>stderr, " left = %s" % left
print >>stderr, " op = %s" % op
print >>stderr, " right = %s" % right
(text,names) = modify_expression(text,left,op,right)
expression += [prefix,text]
s = suffix
for name in names:
variables.add(name)
expression = "".join(expression)
variables = list(variables)
if ("expressions" in debug):
print >>stderr, " --> \"%s\"" % expression
print >>stderr, " --> [%s]" % ",".join(variables)
return (expression,variables)
def modify_expression(s,left,op,right):
(preOp,postOp) = s[len(left)+1:-len(right)-1].split(op)
(left ,leftVarNames) = sanify(left)
(right,rightVarNames) = sanify(right)
varNames = var_names(leftVarNames,rightVarNames)
if (op == "in") and (right in ["FLAG","FLAGS"]):
left = "FLAGS & %s" % left
op = "!="
right = "0"
elif (op == "not in") and (right in ["FLAG","FLAGS"]):
left = "FLAGS & %s" % left
op = "=="
right = "0"
expression = "".join(["(",left,preOp,op,postOp,right,")"])
return (expression,varNames)
def sanify(s):
if (s in knownVariables) or (s in pairVariables):
return (s,[s])
if (s == "None"):
return (s,None)
if (eqnFlagsRe.match(s) != None):
return (s,["FLAGS"])
m = eqnExpressionRe.search(s)
if (m != None):
(left ,leftVarNames) = sanify(m.group("left"))
(right,rightVarNames) = sanify(m.group("right"))
if (type(left) == str) and (left.startswith ("\"")): raise ValueError
if (type(right) == str) and (right.startswith("\"")): raise ValueError
varNames = var_names(leftVarNames,rightVarNames)
return (s,varNames)
try:
validate_number(s)
return (s,None)
except ValueError:
return ("\"" + s + "\"",None)
def validate_number(s):
if (s.upper().startswith("0X")):
_ = int(s[2:],16) # just validation, exception raised if it fails
return
try: _ = int(s)
except ValueError: _ = float(s)
def var_names(names1,names2):
varNames = []
if (names1 != None):