-
Notifications
You must be signed in to change notification settings - Fork 1
/
TargetscanScores.py
182 lines (137 loc) · 8.51 KB
/
TargetscanScores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# TargetscanScores.py
# class to read a targetscan Summary_Counts.txt type file and parse it into gene-specific information with reasonable accessor methods
from collections import defaultdict
class TargetscanScores:
    """Per-transcript view of a TargetScan Summary_Counts.txt style file.

    Each data row of the input describes one (transcript, miRNA family)
    pair.  Rows are split on whitespace and the following columns are read
    (0-based):

         0  Transcript ID (RefSeq ID, used as the key of every table)
         4  Total num conserved sites
         5  Number of conserved 8mer sites
         6  Number of conserved 7mer-m8 sites
         7  Number of conserved 7mer-1a sites
         8  Total num nonconserved sites
         9  Number of nonconserved 8mer sites
        10  Number of nonconserved 7mer-m8 sites
        11  Number of nonconserved 7mer-1a sites
        12  Representative miRNA
        13  Total context score ("NULL" when absent)

    Example row:
        NM_000014 A2M AAAAGUG 9606 0 0 0 0 1 0 1 0 hsa-miR-548t -0.128 NULL

    All tables are ``defaultdict`` instances, so every accessor returns the
    type's default (``[]``, ``0`` or ``0.0``) for a transcript ID that never
    appeared in the file, and the lookup stores that default under the ID.
    """

    def __init__(self, targetscanScoreFile):
        """Parse ``targetscanScoreFile`` and build all per-transcript tables.

        targetscanScoreFile -- an open, line-iterable text file object.  Its
            first line is treated as a header and discarded.  Blank lines
            and rows whose total context score is "NULL" are skipped.

        Raises IndexError if a non-blank row has fewer than 14 whitespace
        separated fields, and ValueError if a numeric column cannot be
        converted.
        """
        # --- all sites -----------------------------------------------------
        self.refSeqToScores = defaultdict(list)      # list of (miRNA, score) per refseq ID
        self.refSeqToScoreSum = defaultdict(float)   # sum of context scores per refseq ID
        self.refSeqToMinScore = defaultdict(float)   # min context score per refseq ID
        self.refSeqToNumSites = defaultdict(int)     # conserved + nonconserved site count
        # --- conserved sites -----------------------------------------------
        self.refSeqToCnsvScores = defaultdict(list)     # list of (miRNA, score) for conserved rows
        self.refSeqToCnsvScoreSum = defaultdict(float)  # sum of scores of conserved rows
        self.refSeqToCnsvMinScore = defaultdict(float)  # min score among conserved rows
        self.refSeqToCnsvNumSites = defaultdict(int)    # number of conserved sites
        # --- nonconserved sites --------------------------------------------
        self.refSeqToNoncnsvNumSites = defaultdict(int)    # number of nonconserved sites
        self.refSeqToNoncnsvScores = defaultdict(list)     # list of (miRNA, score) for nonconserved rows
        self.refSeqToNoncnsvScoreSum = defaultdict(float)  # sum of scores of nonconserved rows
        self.refSeqToNoncnsvMinScore = defaultdict(float)  # min score among nonconserved rows
        # --- per site-type counts ------------------------------------------
        self.refSeqToCnsv8merSites = defaultdict(int)       # conserved 8mer sites
        self.refSeqToCnsv7merm8Sites = defaultdict(int)     # conserved 7mer-m8 sites
        self.refSeqToCnsv7mer1aSites = defaultdict(int)     # conserved 7mer-1a sites
        self.refSeqToNoncnsv8merSites = defaultdict(int)    # nonconserved 8mer sites
        self.refSeqToNoncnsv7merm8Sites = defaultdict(int)  # nonconserved 7mer-m8 sites
        self.refSeqToNoncnsv7mer1aSites = defaultdict(int)  # nonconserved 7mer-1a sites

        # skip the header
        targetscanScoreFile.readline()
        for line in targetscanScoreFile:
            if not line.strip():
                continue
            fields = line.split()
            if fields[13] == "NULL":
                # no context score for this row: nothing to aggregate
                continue

            refSeq = fields[0]
            numCnsv = int(fields[4])
            numNoncnsv = int(fields[8])
            site = (fields[12], float(fields[13]))
            score = site[1]

            self.refSeqToScores[refSeq].append(site)
            self.refSeqToScoreSum[refSeq] += score
            self.refSeqToNumSites[refSeq] += numCnsv + numNoncnsv

            # A row is attributed to exactly one category: "conserved" when
            # it has at least one conserved site, "nonconserved" otherwise.
            # Only the counts of the chosen category are accumulated.
            if numCnsv:
                self.refSeqToCnsvScores[refSeq].append(site)
                self.refSeqToCnsvScoreSum[refSeq] += score
                self.refSeqToCnsv8merSites[refSeq] += int(fields[5])
                self.refSeqToCnsv7merm8Sites[refSeq] += int(fields[6])
                self.refSeqToCnsv7mer1aSites[refSeq] += int(fields[7])
                self.refSeqToCnsvNumSites[refSeq] += numCnsv
            else:
                self.refSeqToNoncnsvScores[refSeq].append(site)
                self.refSeqToNoncnsvScoreSum[refSeq] += score
                self.refSeqToNoncnsv8merSites[refSeq] += int(fields[9])
                self.refSeqToNoncnsv7merm8Sites[refSeq] += int(fields[10])
                self.refSeqToNoncnsv7mer1aSites[refSeq] += int(fields[11])
                self.refSeqToNoncnsvNumSites[refSeq] += numNoncnsv

        # compute meta properties here (min score, etc).  Every list stored
        # above has at least one entry, so min() never sees an empty input.
        # .items() (rather than .iteritems()) works on Python 2 and 3.
        for sitesByRefSeq, minByRefSeq in (
            (self.refSeqToScores, self.refSeqToMinScore),
            (self.refSeqToCnsvScores, self.refSeqToCnsvMinScore),
            (self.refSeqToNoncnsvScores, self.refSeqToNoncnsvMinScore),
        ):
            for refSeq, sites in sitesByRefSeq.items():
                minByRefSeq[refSeq] = min(score for _, score in sites)

    def getScores(self, refSeqGene):
        """Return the list of (miRNA, score) tuples for all rows of the ID."""
        return self.refSeqToScores[refSeqGene]

    def getScoreSum(self, refSeqGene):
        """Return the sum of the context scores over all rows of the ID."""
        return self.refSeqToScoreSum[refSeqGene]

    def getMinScore(self, refSeqGene):
        """Return the minimum context score over all rows of the ID."""
        return self.refSeqToMinScore[refSeqGene]

    def getNumSites(self, refSeqGene):
        """Return the total (conserved + nonconserved) site count."""
        return self.refSeqToNumSites[refSeqGene]

    def getCnsvScores(self, refSeqGene):
        """Return the (miRNA, score) tuples of the conserved rows."""
        return self.refSeqToCnsvScores[refSeqGene]

    def getCnsvScoreSum(self, refSeqGene):
        """Return the sum of the context scores of the conserved rows."""
        return self.refSeqToCnsvScoreSum[refSeqGene]

    def getCnsvMinScore(self, refSeqGene):
        """Return the minimum context score among the conserved rows."""
        return self.refSeqToCnsvMinScore[refSeqGene]

    def getCnsvNumSites(self, refSeqGene):
        """Return the number of conserved sites."""
        return self.refSeqToCnsvNumSites[refSeqGene]

    def getNoncnsvNumSites(self, refSeqGene):
        """Return the number of nonconserved sites (nonconserved rows only)."""
        return self.refSeqToNoncnsvNumSites[refSeqGene]

    def getNoncnsvScores(self, refSeqGene):
        """Return the (miRNA, score) tuples of the nonconserved rows."""
        return self.refSeqToNoncnsvScores[refSeqGene]

    def getNoncnsvScoreSum(self, refSeqGene):
        """Return the sum of the context scores of the nonconserved rows."""
        return self.refSeqToNoncnsvScoreSum[refSeqGene]

    def getNoncnsvMinScore(self, refSeqGene):
        """Return the minimum context score among the nonconserved rows."""
        return self.refSeqToNoncnsvMinScore[refSeqGene]

    def getCnsv8merSites(self, refSeqGene):
        """Return the number of conserved 8mer sites."""
        return self.refSeqToCnsv8merSites[refSeqGene]

    def getCnsv7merm8Sites(self, refSeqGene):
        """Return the number of conserved 7mer-m8 sites."""
        return self.refSeqToCnsv7merm8Sites[refSeqGene]

    def getCnsv7mer1aSites(self, refSeqGene):
        """Return the number of conserved 7mer-1a sites."""
        return self.refSeqToCnsv7mer1aSites[refSeqGene]

    def getNoncnsv8merSites(self, refSeqGene):
        """Return the number of nonconserved 8mer sites."""
        return self.refSeqToNoncnsv8merSites[refSeqGene]

    def getNoncnsv7merm8Sites(self, refSeqGene):
        """Return the number of nonconserved 7mer-m8 sites."""
        return self.refSeqToNoncnsv7merm8Sites[refSeqGene]

    def getNoncnsv7mer1aSites(self, refSeqGene):
        """Return the number of nonconserved 7mer-1a sites."""
        return self.refSeqToNoncnsv7mer1aSites[refSeqGene]
# print self.refSeqToScores
# print self.refSeqToScoreSum
# print self.refSeqToMinScore
# print "self.refSeqToNumSites"
# print self.refSeqToNumSites
# print "self.refSeqToCnsvScores"
# print self.refSeqToCnsvScores
# print "self.refSeqToCnsvScoreSum"
# print self.refSeqToCnsvScoreSum
# print "self.refSeqToCnsvMinScore"
# print self.refSeqToCnsvMinScore
# print "self.refSeqToNumCnsvSites"
# print self.refSeqToCnsvNumSites
# print "self.refSeqToNoncnsvNumSites"
# print self.refSeqToNoncnsvNumSites
# print "self.refSeqToNoncnsvScores"
# print self.refSeqToNoncnsvScores
# print "self.refSeqToNoncnsvScoreSum"
# print self.refSeqToNoncnsvScoreSum
# print "self.refSeqToNoncnsvMinScore"
# print self.refSeqToNoncnsvMinScore
# print "self.refSeqToCnsv8merSites"
# print self.refSeqToCnsv8merSites
# print "self.refSeqToCnsv7merm8Sites"
# print self.refSeqToCnsv7merm8Sites
# print "self.refSeqToCnsv7mer1aSites"
# print self.refSeqToCnsv7mer1aSites
# print "self.refSeqToNoncnsv8merSites"
# print self.refSeqToNoncnsv8merSites
# print "self.refSeqToNoncnsv7merm8Sites"
# print self.refSeqToNoncnsv7merm8Sites
# print "self.refSeqToNoncnsv7mer1aSites"
# print self.refSeqToNoncnsv7mer1aSites