DPTCTC.lua
require 'cutorch'
require 'cunn' -- nn.DataParallelTable
require 'nnx'  -- nn.CTCCriterion
local cudaLaunchBlocking = os.getenv('CUDA_LAUNCH_BLOCKING') == '1' -- skip nccl reductions when launch blocking is on
-- DPTCTC inherits from DataParallelTable and computes the CTC loss and gradOutput on
-- each worker GPU; call :threads() before training so every worker has its own criterion.
local DPTCTC = torch.class('nn.DPTCTC', 'nn.DataParallelTable')
local ThreadsImplCTC = torch.class('nn.DPTCTC.Threads', 'nn.DataParallelTable.Threads')

function ThreadsImplCTC:__init(dpt, initFunc)
   self.dpt = dpt
   self.initFunc = initFunc
   -- master copy of the CTC criterion; applyChanges() clones it onto each worker GPU
   self.ctc = nn.CTCCriterion():cuda()
end

function ThreadsImplCTC:applyChanges()
   if self.__threads then
      local module = self.dpt.modules[1]
      local ctc = self.ctc
      for i, gpu in ipairs(self.dpt.gpuAssignments) do
         self.__threads:addjob(i, function()
            cutorch.setDevice(gpu)
            if i == 1 then
               -- the first worker reuses the master module and criterion
               _G.module = module
               _G.ctc = ctc
            else
               -- other workers get their own clones on their assigned GPU
               _G.module = nil
               _G.ctc = nil
               collectgarbage()
               _G.module = module:clone()
               _G.ctc = ctc:clone()
            end
         end)
      end
      self.__threads:synchronize()
   end
end

function ThreadsImplCTC:exec(closure)
   self:setup()
   local res = {}
   for i = 1, #self.dpt.gpuAssignments do
      self.__threads:addjob(i,
         function()
            -- pass the per-thread CTC criterion as a third argument to the closure
            return closure(_G.module, i, _G.ctc)
         end,
         function(_res_)
            res[i] = _res_
         end)
   end
   self.__threads:synchronize()
   return res
end

local function hasFlattenedParameters(self)
   if not self.flattenedParams then
      return false
   end
   for _, param in ipairs(self.modules[1]:parameters()) do
      if param:storage() ~= self.flattenedParams[1][1]:storage() then
         return false
      end
   end
   return true
end

-- extracts the value at idx from each entry in tbl
local function pluck(tbl, idx)
   local r = {}
   for n, val in ipairs(tbl) do
      r[n] = val[idx]
   end
   return r
end

function DPTCTC:threads(initFunc)
   require 'threads'
   self.impl:close()
   self.impl = nn.DPTCTC.Threads(self, initFunc)
   return self
end

function DPTCTC:updateOutput(input)
   if self.flattenParams and not hasFlattenedParameters(self) then
      self:flattenParameters()
   end
   if self.needsSync then
      self:syncParameters()
   end
   local prevGpuid = cutorch.getDevice()
   -- distribute the input to GPUs
   self:_distribute(self.inputGpu, input)
   -- update output for each module; the per-GPU outputs are kept as a table
   -- (they are consumed on their own GPU by the backward pass, never gathered)
   local inputGpu = self.inputGpu
   self.outputGpu = self.impl:exec(function(m, i)
      if torch.isTensor(inputGpu[i]) and inputGpu[i]:numel() == 0 then
         return torch.CudaTensor()
      else
         return m:updateOutput(inputGpu[i])
      end
   end)
   self.output = self.outputGpu
   cutorch.setDevice(prevGpuid)
   return self.output
end

function DPTCTC:backward(input, target, size, scale)
   return self:__backward_inner('backward', input, target, size, scale)
end

function DPTCTC:updateGradInput(input, target, size)
   return self:__backward_inner('updateGradInput', input, target, size)
end

local function slice(tbl, first, last, step)
   local sliced
   if torch.type(tbl) == 'table' then
      sliced = {}
      for i = first or 1, last or #tbl, step or 1 do
         sliced[#sliced + 1] = tbl[i]
      end
   else
      -- tensor input: copy elements first..last along the first dimension
      sliced = torch.CudaTensor(last - first + 1)
      sliced:copy(tbl:narrow(1, first, last - first + 1))
   end
   return sliced
end

function DPTCTC:__backward_inner(method, input, target, size, scale)
   local prevGpuid = cutorch.getDevice()
   local inputGpu = self.inputGpu
   local outputGpu = self.outputGpu
   local sizeGpu = {}
   -- distribute the per-sample sizes to the GPUs
   self:_distribute(sizeGpu, size)
   -- per-GPU batch size; the target slicing below assumes every GPU receives this many samples
   local batch_size = inputGpu[1]:size(1)
   -- per-GPU losses; each worker closure writes loss[i] (zero-initialised so empty slices contribute 0)
   local loss = torch.Tensor(#self.gpuAssignments):zero()
   self.gradInputGpu = self.impl:exec(function(m, i, ctc)
      if torch.isTensor(inputGpu[i]) and inputGpu[i]:numel() == 0 then
         return torch.CudaTensor()
      else
         -- each GPU scores its own slice of the targets with its local CTC criterion
         local targets_slice = slice(target, 1 + (i - 1) * batch_size, i * batch_size)
         loss[i] = ctc:forward(outputGpu[i], targets_slice, sizeGpu[i])
         local gradOutput = ctc:backward(outputGpu[i], targets_slice)
         return m[method](m, inputGpu[i], gradOutput, scale)
      end
   end)
   if method == 'backward' then
      local params = self:moduleParameters()
      -- Accumulate the gradients onto the base GPU
      if self.flattenedParams and self.usenccl and not cudaLaunchBlocking then
         if #self.gpuAssignments > 1 then
            nccl.reduce(pluck(self.flattenedParams, 2), nil, true, 1)
         end
      else
         self:_reduce(pluck(params, 2))
      end
      -- Zero out gradients on the other GPUs
      for i = 2, #self.gpuAssignments do
         cutorch.setDevice(self.gpuAssignments[i])
         for _, gradParam in ipairs(params[i][2]) do
            gradParam:zero()
         end
      end
      self.needsSync = true
   end
   cutorch.setDevice(prevGpuid)
   return loss:mean()
end
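
--[[ Usage sketch (illustrative only): the require path, model constructor, GPU ids
     and data below are assumptions, not part of this module. Targets are a table of
     label sequences and `sizes` holds the number of time steps per sample, matching
     what __backward_inner passes to nn.CTCCriterion.

   require 'DPTCTC'                                   -- adjust to wherever this file lives

   local model = buildAcousticModel()                 -- hypothetical network producing per-timestep class scores
   local dpt = nn.DPTCTC(1)                           -- split the batch along dimension 1
      :add(model:cuda(), {1, 2})                      -- replicate onto GPUs 1 and 2
      :threads(function() require 'cunn'; require 'nnx' end) -- required so each worker gets its own criterion

   local outputs = dpt:forward(inputs)                -- table of per-GPU outputs (not concatenated)
   local loss = dpt:backward(inputs, targets, sizes)  -- mean CTC loss across GPUs; gradients land on GPU 1
   dpt:syncParameters()                               -- broadcast updated parameters (also done lazily by updateOutput)
--]]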