-
Notifications
You must be signed in to change notification settings - Fork 2
/
profile.go
204 lines (184 loc) · 6.36 KB
/
profile.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
package seq
import (
"bytes"
"fmt"
"math"
"text/tabwriter"
)
// Profile represents a sequence profile in terms of log-odds scores.
type Profile struct {
// The columns of a profile.
Emissions []EProbs
// The alphabet of the profile. The length of the alphabet should be
// equal to the number of rows in the profile.
// There are no restrictions on the alphabet. (i.e., Gap characters are
// allowed but they are not treated specially.)
Alphabet Alphabet
}
// NewProfile initializes a profile with a default
// alphabet that is compatible with this package's BLOSUM62 matrix.
// Emission probabilities are set to the minimum log-odds probability.
func NewProfile(columns int) *Profile {
return NewProfileAlphabet(columns, AlphaBlosum62)
}
// NewProfileAlphabet initializes a profile with the given alphabet.
// Emission probabilities are set to the minimum log-odds probability.
func NewProfileAlphabet(columns int, alphabet Alphabet) *Profile {
emits := make([]EProbs, columns)
for i := 0; i < columns; i++ {
emits[i] = NewEProbs(alphabet)
}
return &Profile{emits, alphabet}
}
func (p *Profile) Len() int {
return len(p.Emissions)
}
func (p *Profile) String() string {
buf := new(bytes.Buffer)
tabw := tabwriter.NewWriter(buf, 4, 0, 3, ' ', 0)
pf := func(ft string, v ...interface{}) { fmt.Fprintf(tabw, ft, v...) }
for _, r := range p.Alphabet {
pf("%c", rune(r))
for _, eprobs := range p.Emissions {
pf("\t%0.4f", eprobs.Lookup(r))
}
pf("\n")
}
tabw.Flush()
return buf.String()
}
// FrequencyProfile represents a sequence profile in terms of raw frequencies.
// A FrequencyProfile is useful as an intermediate representation. It can be
// used to incrementally build a Profile.
type FrequencyProfile struct {
// The columns of a frequency profile.
Freqs []map[Residue]int
// The alphabet of the profile. The length of the alphabet should be
// equal to the number of rows in the frequency profile.
// There are no restrictions on the alphabet. (i.e., Gap characters are
// allowed but they are not treated specially.)
Alphabet Alphabet
}
func (fp *FrequencyProfile) String() string {
buf := new(bytes.Buffer)
tabw := tabwriter.NewWriter(buf, 4, 0, 3, ' ', 0)
pf := func(ft string, v ...interface{}) { fmt.Fprintf(tabw, ft, v...) }
for _, r := range fp.Alphabet {
pf("%c", rune(r))
for _, column := range fp.Freqs {
pf("\t%d", column[r])
}
pf("\n")
}
tabw.Flush()
return buf.String()
}
// NewNullProfile initializes a frequency profile that can be used to tabulate
// a null model. This is equivalent to calling NewFrequencyProfile with the
// number of columns set to 1.
func NewNullProfile() *FrequencyProfile {
return NewFrequencyProfile(1)
}
// NewFrequencyProfile initializes a frequency profile with a default
// alphabet that is compatible with this package's BLOSUM62 matrix.
func NewFrequencyProfile(columns int) *FrequencyProfile {
return NewFrequencyProfileAlphabet(columns, AlphaBlosum62)
}
// NewFrequencyProfileAlphabet initializes a frequency profile with the
// given alphabet.
func NewFrequencyProfileAlphabet(
columns int,
alphabet Alphabet,
) *FrequencyProfile {
freqs := make([]map[Residue]int, columns)
for i := 0; i < columns; i++ {
freqs[i] = make(map[Residue]int, len(alphabet))
for _, residue := range alphabet {
freqs[i][residue] = 0
}
}
return &FrequencyProfile{freqs, alphabet}
}
// Len returns the number of columns in the frequency profile.
func (fp *FrequencyProfile) Len() int {
return len(fp.Freqs)
}
// Combine adds the given frequency profile to the current one.
// Both profiles must have the same number of columns.
func (fp1 *FrequencyProfile) Combine(fp2 *FrequencyProfile) {
if fp1.Len() != fp2.Len() {
panic(fmt.Sprintf("Profile has length %d but other profile has "+
"length %d", fp1.Len(), fp2.Len()))
}
for c := 0; c < fp1.Len(); c++ {
for residue := range fp1.Freqs[c] {
fp1.Freqs[c][residue] += fp2.Freqs[c][residue]
}
}
}
// Add adds the sequence to the given profile. The sequence must have length
// equivalent to the number of columns in the profile. The sequence must also
// only contain residues that are in the alphabet for the profile.
//
// As a special case, if the alphabet contains the 'X' residue, then any
// unrecognized residues in the sequence with respect to the profile's alphabet
// will be considered as an 'X' residue.
func (fp *FrequencyProfile) Add(s Sequence) {
if fp.Len() != s.Len() {
panic(fmt.Sprintf("Profile has length %d but sequence has length %d",
fp.Len(), s.Len()))
}
for column := 0; column < fp.Len(); column++ {
r := s.Residues[column]
if _, ok := fp.Freqs[column][r]; ok {
fp.Freqs[column][r] += 1
} else if _, ok := fp.Freqs[column]['X']; ok {
fp.Freqs[column]['X'] += 1
} else {
panic(fmt.Sprintf("Unrecognized residue %c while using an "+
"alphabet without a wildcard: '%s'.", r, fp.Alphabet))
}
}
}
// Profile converts a raw frequency profile to a profile that uses a log-odds
// representation. The log-odds scores are computed with the given null model,
// which is itself just a raw frequency profile with a single column.
func (fp *FrequencyProfile) Profile(null *FrequencyProfile) *Profile {
if null.Len() != 1 {
panic(fmt.Sprintf("null model has %d columns; should have 1",
null.Len()))
}
if !fp.Alphabet.Equals(null.Alphabet) {
panic(fmt.Sprintf("freq profile alphabet '%s' is not equal to "+
"null profile alphabet '%s'.", fp.Alphabet, null.Alphabet))
}
p := NewProfileAlphabet(fp.Len(), fp.Alphabet)
// Compute the background emission probabilities.
nulltot := freqTotal(null.Freqs[0])
nullemit := make(map[Residue]float64, fp.Alphabet.Len())
for _, residue := range null.Alphabet {
nullemit[residue] = float64(null.Freqs[0][residue]) / float64(nulltot)
}
// Now compute the emission probabilities and convert to log-odds.
for column := 0; column < fp.Len(); column++ {
tot := freqTotal(fp.Freqs[column])
for _, residue := range fp.Alphabet {
if null.Freqs[0][residue] == 0 || fp.Freqs[column][residue] == 0 {
p.Emissions[column].Set(residue, MinProb)
} else {
prob := float64(fp.Freqs[column][residue]) / float64(tot)
logOdds := -Prob(math.Log(prob / nullemit[residue]))
p.Emissions[column].Set(residue, logOdds)
}
}
}
return p
}
// freqTotal computes the total frequency in a single column.
func freqTotal(column map[Residue]int) int {
tot := 0
for _, freq := range column {
tot += freq
}
return tot
}