forked from glennhickey/teHmm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcfg.py
385 lines (340 loc) · 16.6 KB
/
cfg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
#!/usr/bin/env python
#Copyright (C) 2013 by Glenn Hickey
#
#Released under the MIT license, see LICENSE.txt
#!/usr/bin/env python
import os
import sys
import numpy as np
import pickle
import string
import logging
from collections import Iterable
from numpy.testing import assert_array_equal, assert_array_almost_equal
from .emission import IndependentMultinomialEmissionModel
from .track import TrackList, TrackTable, Track
from .hmm import MultitrackHmm
from .common import EPSILON, LOGZERO, myLog, logger
from ._cfg import fastCykTable
from .basehmm import normalize
from .basehmm import NEGINF
""" Generalize MultitrackHmm (hmm.py) to a Stochastic Context Free Grammer
(CFG) while preserving more or less the same interface (and using the same
emission model). The CFG is diferentiated by passing in a list of states
(which are all integers at this point) that trigger pair emissions (for example
these could be states corresponding to LTR or TSD). All other states are
treated as HMM states (ie emit one column at a time left-to-right. For now
these sets are disjoint, ie the nest states always emit pairs and the left to
right don't though that could be relaxed in theory. For now only the
supervised training is supported, and we donn't implement full EM algorithm.
All productions look like one of the following (nonterminals are capitals):
X -> Y Z
X -> a
X -> aa
X -> a Y a
The first two are Chmosky Normal Form, but we add the last two kinds because
we want to explicitly model pair emissions.
"""
class MultitrackCfg(object):
def __init__(self, emissionModel, pairEmissionModel,
nestStates=[], state_name_map = None):
""" For now we take in an emission model (that was used for the HMM)
and a list singling out a few states as nested / pair emission states"""
self.emissionModel = emissionModel
self.pairEmissionModel = pairEmissionModel
self.logProbs1 = None
self.logProbs2 = None
self.startProbs = None
self.defAlignmentSymbol = 0
self.PAIRFLAG = -2
self.trackList = None
self.stateNameMap = state_name_map
# all states that can emit a column
self.M = self.emissionModel.getNumStates()
self.emittingStates = np.arange(self.M)
# all states flagged as nest enabled (used by training)
self.nestStates = np.array(nestStates)
# all states that are LR only (ie the rest)
self.hmmStates = []
for i in self.emittingStates:
if i not in self.nestStates:
self.hmmStates.append(i)
self.hmmStates = np.array(self.hmmStates)
assert len(self.hmmStates) + len(self.nestStates) == self.M
# start with a basic flat distribution
self.initParams()
def getLogProbTables(self):
""" Direct access to the production probability tables. The first
talble is for CNF type production X->YZ, and the second one is
specifically for nested pair emission productions X->xYx"""
return self.logProbs1, self.logProbs2
def getStartProbs(self):
""" Get the (non-log to be compatible with hmm) start probabilities"""
return np.exp(self.startProbs)
def getTrackList(self):
return self.trackList
def viterbi(self, trackData, numThreads = 1):
""" Return the output of the Viterbi algorithm on the loaded
data: a tuple of (log likelihood of best path, and the path itself)
(one data point of each interval of track data)
"""
output = []
alignmentTrackTableList = trackData.getAlignmentTrackTableList()
alignmentTable = None
for i, trackTable in enumerate(trackData.getTrackTableList()):
if len(alignmentTrackTableList) > 0:
alignmentTable = alignmentTrackTableList[i]
prob, states = self.decode(trackTable,
alignmentTrack=alignmentTable,
numThreads=numThreads)
if self.stateNameMap is not None:
states = map(self.stateNameMap.getMapBack, states)
output.append((prob,states))
return output
def createTables(self):
""" all probabailities of the form X -> Y Z (note that we will only
ever use a small subset. will keep track using a shortcut table for
iterations """
self.logProbs1 = NEGINF + np.zeros((self.M, self.M, self.M),
dtype = np.float)
""" all probabilities of the form X -> aYa (ie Y is nested inside of X)
note that we are not counting probability of emitting a """
self.logProbs2 = NEGINF + np.zeros((self.M, self.M), dtype = np.float)
def createHelperTables(self):
""" to avoid iterating over all possible productions when many may
have zero probaiblities, we keep arrays of nozero productions """
self.helper1 = []
self.helperDim1 = np.zeros((self.M,), dtype=np.int32)
maxRow = 0
for state in self.emittingStates:
stateList = []
for next1 in self.emittingStates:
for next2 in self.emittingStates:
if self.logProbs1[state, next1, next2] > NEGINF:
stateList.append([next1, next2])
maxRow = max(len(stateList), maxRow)
self.helperDim1[state] = len(stateList)
self.helper1.append(stateList)
npHelper1 = -1 + np.zeros((self.M, maxRow, 2), dtype=np.int32)
for state in xrange(len(self.helper1)):
for i, entry in enumerate(self.helper1[state]):
assert len(entry) == 2
npHelper1[state, i] = entry
self.helper1 = npHelper1
self.helper2 = []
self.helperDim2 = np.zeros((self.M,), dtype=np.int32)
maxRow = 0
for state in self.emittingStates:
stateList = []
for nextState in self.emittingStates:
if self.logProbs2[state, nextState] > NEGINF:
stateList.append(nextState)
maxRow = max(len(stateList), maxRow)
self.helperDim2[state] = len(stateList)
self.helper2.append(stateList)
npHelper2 = -1 + np.zeros((self.M, maxRow), dtype=np.int32)
for state in xrange(len(self.helper2)):
for i, entry in enumerate(self.helper2[state]):
npHelper2[state, i] = entry
self.helper2 = npHelper2
def initParams(self):
""" Allocate the production (transition) probability matrix and
initialize it to a flat distribution where everything has equal prob."""
self.createTables()
# flat start probs over all states
oneOfAny = myLog(1. / self.M)
self.startProbs = oneOfAny + np.zeros((self.M,), dtype=np.float)
# hmm states flat of X - > X Y (where Y is any state)
for state in self.hmmStates:
for nextState in self.emittingStates:
self.logProbs1[state, state, nextState] = oneOfAny
# cfg states X -> a Y a (prob Y nested in X). X is a nested state
# but Y is any state
# (-1 below to correct counting for state == nextState case)
oneOfCfgRHS = myLog(1. / (3.0 * self.M - 1.0))
for state in self.nestStates:
for nextState in self.emittingStates:
self.logProbs1[state, state, nextState] = oneOfCfgRHS
self.logProbs1[state, nextState, state] = oneOfCfgRHS
self.logProbs2[state, nextState] = oneOfCfgRHS
# create shortcut tables:
self.createHelperTables()
self.validate()
def validate(self):
""" check that all the probabilities in the production matrix rows add
up to one"""
total = 0.
for state in self.emittingStates:
total += np.exp(self.startProbs[state])
assert_array_almost_equal(total, 1.)
for origin in self.emittingStates:
total = 0.
for next1 in self.emittingStates:
for next2 in self.emittingStates:
total += np.exp(self.logProbs1[origin, next1, next2])
total += np.exp(self.logProbs2[origin, next1])
assert_array_almost_equal(total, 1.)
# now verify the helper tables exhibit the same behaviour
for origin in self.emittingStates:
total = 0.
for i in xrange(self.helperDim1[origin]):
total += np.exp(self.logProbs1[origin,
self.helper1[origin, i][0],
self.helper1[origin, i][1]])
for i in xrange(self.helperDim2[origin]):
total += np.exp(self.logProbs2[origin,
self.helper2[origin, i]])
assert_array_almost_equal(total, 1.)
def __initDPTable(self, obs, alignmentTrack):
""" Create the 2D dynamic programming table for CYK etc. and initialise
all the 1-length entries for each (emitting) state"""
# Create a dynamic programming traceback (for CYK) table to remember
#which states got used
self.tb = -1 + np.zeros((len(obs), len(obs), self.M, 3),
dtype = np.int64)
self.dp = NEGINF + np.zeros((len(obs), len(obs), self.M),
dtype = np.float)
self.emLogProbs = self.emissionModel.allLogProbs(obs)
# todo: how fast is this type of loop?
assert len(self.emLogProbs) == len(obs)
for i in xrange(len(obs)):
for j in self.hmmStates:
self.dp[i,i,j] = self.emLogProbs[i,j]
baseMatch = alignmentTrack is not None
# pair emissions where emitted columns are right beside eachother
for i in xrange(len(obs)-1):
match = baseMatch and\
alignmentTrack[i,0] != self.defAlignmentSymbol and\
alignmentTrack[i,0] == alignmentTrack[i+1,0]
for j in self.nestStates:
self.dp[i,i+1,j] = self.pairEmissionModel.pairLogProb(
j, self.emLogProbs[i,j], self.emLogProbs[i+1,j], match)
self.tb[i, i+1, j] = [self.PAIRFLAG, 0, 0]
def __cyk(self, obs, alignmentTrack = None, numThreads=1):
""" Do the CYK dynamic programming algorithm (like viterbi) to
compute the maximum likelihood CFG derivation of the observations."""
self.__initDPTable(obs, alignmentTrack)
if isinstance(obs, TrackTable):
obs = obs.getNumPyArray()
assert obs.dtype == np.uint8
if alignmentTrack is None:
alignmentTrack = np.ndarray((0,1), dtype = np.uint16)
if isinstance(alignmentTrack, TrackTable):
alignmentTrack = alignmentTrack.getNumPyArray()
assert alignmentTrack.dtype == np.uint16
fastCykTable(self, obs, alignmentTrack, numThreads)
score = max([self.startProbs[i] + self.dp[0, len(obs)-1, i] \
for i in self.emittingStates])
return score
def __traceBack(self, obs):
""" depth first search to determine most likely states from the
traceback table (self.tb) that was constructed during __cyk"""
trace = -1 + np.zeros(len(self.dp))
top = np.argmax([self.startProbs[i] + self.dp[0, len(obs)-1, i]\
for i in xrange(self.M)])
tbRecurseStack = [(0, len(self.tb) - 1, top, trace)]
while len(tbRecurseStack) > 0:
i, j, state, trace = tbRecurseStack.pop()
self.assigned = 0
assert i >= 0
assert j >= i
size = j - i + 1
if size == 1:
trace[i] = state
self.assigned += 1
else:
(k, r1State, r2State) = self.tb[i, j, state]
if k == self.PAIRFLAG:
#cheap hack, k set to -2 to flag a pair emission
trace[i] = state
trace[j] = state
assert r1State == r2State
if size > 2:
tbRecurseStack.append((i+1, j-1, r1State, trace))
else:
tbRecurseStack.append((i, k, r1State, trace))
tbRecurseStack.append((k+1, j, r2State, trace))
assert self.assigned <= len(obs)
return trace
def decode(self, obs, alignmentTrack = None, defAlignmentSymbol=0,
numThreads=1):
""" return tuple of log prob and most likely state sequence. same
as in the hmm. """
self.defAlignmentSymbol = defAlignmentSymbol
if numThreads > 1:
logger.info("%d threads activated for CYK" % numThreads)
return self.__cyk(obs, alignmentTrack,
numThreads=numThreads), self.__traceBack(obs)
def supervisedTrain(self, trackData, bedIntervals):
""" Production porbabilites determined by frequencies two states
are adjacent in the training data. In fact, we mostly piggyback
off the HMM training for now, using left-right adjacency as proxy
for nesting events as well (not sure there's much way around this
in fact..."""
self.trackList = trackData.getTrackList()
self.initParams()
hmm = MultitrackHmm(self.emissionModel)
hmm.supervisedTrain(trackData, bedIntervals)
assert (self.startProbs.shape == hmm.getStartProbs().shape)
self.startProbs = myLog(hmm.getStartProbs())
# map hmm transitions to X -> X Y productions
# for each hmm transition, P(X->Y) we let
# P(X->XY) = = P(X->YX) = P(X->Y) /2
hmmProbs = hmm.transmat_
for lState in self.hmmStates:
for rState in self.emittingStates:
hp = hmmProbs[lState, rState]
if lState != rState:
hp /= 2.
self.logProbs1[lState, lState, rState] = myLog(hp)
self.logProbs1[lState, rState, lState] = myLog(hp)
# like above but we also split across the table2
for lState in self.nestStates:
for rState in self.emittingStates:
hp = hmmProbs[lState, rState]
if lState != rState:
hp /= 3.
else:
hp /= 2.
self.logProbs1[lState, lState, rState] = myLog(hp)
self.logProbs1[lState, rState, lState] = myLog(hp)
self.logProbs2[lState, rState] = myLog(hp)
# make sure the helper tables are up to date
self.createHelperTables()
def __str__(self):
""" Pretty print model. Note -- too much code duplicated here and in
hmm (ie especially emission stuff..) TODO: merge up somehow"""
hmmStates = self.hmmStates
nestStates = self.nestStates
states = xrange(self.M)
if self.stateNameMap is not None:
states = map(self.stateNameMap.getMapBack, states)
hmmStates = map(self.stateNameMap.getMapBack, self.hmmStates)
nestStates = map(self.stateNameMap.getMapBack, self.nestStates)
s = "\nNumStates = %d:\nSingle:%s\nPair:%s\n" % (self.M, str(hmmStates),
str(nestStates))
sp = [(states[i], self.startProbs[i])
for i in xrange(self.M)]
s += "\nStart probs =\n%s\n" % str(sp)
s += "\nlogTable1 = \n%s\n" % str(self.logProbs1)
s += "\nlogTable2 = \n%s\n" % str(self.logProbs2)
em = self.emissionModel
s += "\nNumber of symbols per track=\n%s\n" % str(
em.getNumSymbolsPerTrack())
s += "\nEmissions =\n"
emProbs = em.getLogProbs()
for state, stateName in enumerate(hmmStates):
s += "State %s:\n" % stateName
for trackNo in xrange(em.getNumTracks()):
track = self.trackList.getTrackByNumber(trackNo)
s += " Track %d %s (%s):\n" % (track.getNumber(),
track.getName(),
track.getDist())
numSymbolsPerTrack = em.getNumSymbolsPerTrack()
for idx, symbol in enumerate(em.getTrackSymbols(trackNo)):
symbolName = track.getValueMap().getMapBack(symbol)
prob = np.exp(emProbs[trackNo][state][symbol])
if idx <= 2 or prob > 0.01:
s += " %s) %s: %f (log=%f)\n" % (symbol, symbolName,
prob, myLog(prob))
return s