From 5d373d0f20fba969f1956f4f9331099c8cb024da Mon Sep 17 00:00:00 2001
From: SimonDanisch
Date: Mon, 26 Feb 2018 23:24:23 +0100
Subject: [PATCH] stab at softmax kernel

---
 src/nlib.jl | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 src/nlib.jl

diff --git a/src/nlib.jl b/src/nlib.jl
new file mode 100644
index 00000000..547b1bd6
--- /dev/null
+++ b/src/nlib.jl
@@ -0,0 +1,266 @@
+using GPUArrays, MacroTools
+
+# unroll helper
+"""
+Unrolls like `ntuple`, but lets the caller pass `args` explicitly -
+this avoids the boxing that capturing them in a closure would cause.
+"""
+@generated function arg_ntuple(f::F, n::Type{Val{N}}, args::Vararg{Any, N2}) where {F, N, N2}
+    args_un = ntuple(i-> :(args[$i]), N2)
+    tup = ntuple(i-> :(f($i, $(args_un...))), N)
+    quote
+        Base.@_inline_meta
+        ($(tup...),)
+    end
+end
+
+
+"""
+Like `arg_ntuple`, but threads a reduction variable `v0` through the calls,
+which would get boxed if it were captured by a closure instead.
+"""
+@generated function nreduce(f, ::Type{Val{N}}, v0, args::NTuple{N2, Any}) where {N, N2}
+    args_un = ntuple(i-> :(args[$i]), N2)
+    reduced = ntuple(i-> :(v0 = f($i, v0, $(args_un...))), N)
+    quote
+        Base.@_inline_meta
+        $(reduced...)
+        v0
+    end
+end
+
+"""
+Unroll macro for static ranges. The loop bounds need to be integer literals,
+so the range can be expanded at macro expansion time.
+"""
+macro unroll(forloop)
+    @capture(forloop, for idx_ in from_:to_; body__; end) || throw(
+        ArgumentError("Needs to be a for loop over a literal range. Found: $forloop")
+    )
+    body = esc.(body)
+    expr = Expr(:block)
+    expr.args = map(from:to) do i
+        :($(esc(idx)) = $i; $(body...))
+    end
+    return expr
+end
+
+
+function blockReduce(
+        state,
+        smem::AbstractArray{AccumT},
+        val::AccumT,
+        op,
+        v0::AccumT
+    ) where AccumT
+    # To avoid RaW races from chaining blockReduce calls together, we need a sync here
+    synchronize_threads(state)
+
+    smem[threadidx_x(state)] = val
+
+    synchronize_threads(state)
+
+    warpVal = v0
+    # First warp will perform per-warp reductions for the remaining warps
+    ui32 = UInt32(32)
+    ui1 = UInt32(1)
+    if threadidx_x(state) <= ui32 # threads 1:32 form the first warp (1 based indexing)
+        lane = (threadidx_x(state) - ui1) % ui32
+        if lane < blockdim_x(state) ÷ ui32
+            @unroll for idx in 1:32
+                warpVal = op(warpVal, smem[lane * ui32 + UInt32(idx)])
+            end
+            smem[ui1 + lane] = warpVal
+        end
+    end
+
+    synchronize_threads(state)
+
+    # First thread will perform a reduction of the above per-warp reductions
+    blockVal = v0
+
+    if threadidx_x(state) == ui1
+        for i in ui1:(((blockdim_x(state) - ui1) ÷ ui32) + ui1)
+            blockVal = op(blockVal, smem[i])
+        end
+        smem[ui1] = blockVal
+    end
+
+    # Sync and broadcast
+    synchronize_threads(state)
+    return smem[ui1]
+end
+
+function ilpReduce(
+        state,
+        data::AbstractArray{T},
+        data_offset,
+        size,
+        op,
+        v0::AccumT,
+        ilp::Type{Val{ILP}}
+    ) where {T, AccumT, ILP}
+
+    ILP32 = UInt32(ILP)
+    threadVal = v0
+    ui1 = UInt32(1)
+    offset = threadidx_x(state) - ui1 # 0 based offset of this thread
+
+    last = size % (ILP32 * blockdim_x(state))
+
+    # Body (unrolled ILP times)
+    while offset < (size - last)
+        # gather ILP strided elements starting at data_offset + offset (0 based)
+        tmp = arg_ntuple(ilp, data_offset + offset, data) do j, off, d
+            d[ui1 + off + UInt32(j - 1) * blockdim_x(state)]
+        end
+        # fold the gathered tuple into the per-thread accumulator
+        threadVal = nreduce(ilp, threadVal, (tmp,)) do idx, v0, a
+            op(v0, a[idx])
+        end
+        offset += blockdim_x(state) * ILP32
+    end
+
+    # Epilogue for the remaining elements
+    while offset < size
+        threadVal = op(threadVal, data[ui1 + data_offset + offset])
+        offset += blockdim_x(state)
+    end
+
+    return threadVal
+end
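+
+# Quick CPU-side sanity sketch for the two helpers above, with made-up values and
+# plain Arrays standing in for device arrays; not used by the kernels below.
+let demo_data = collect(1.0:8.0)
+    # unrolled gather of 4 elements after a 0 based offset of 2
+    demo_vals = arg_ntuple((i, off, d)-> d[off + i], Val{4}, 2, demo_data)
+    # fold the gathered tuple without allocating a closure
+    demo_sum = nreduce((i, v0, t)-> v0 + t[i], Val{4}, 0.0, (demo_vals,))
+    @assert demo_vals == (3.0, 4.0, 5.0, 6.0) && demo_sum == 18.0
+end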
+
+
+function softmax_forward(
+        state,
+        output::AbstractArray{T},
+        input::AbstractArray{T},
+        ::Type{AccumT},
+        classes::UInt32,
+        ::Type{Val{SMemSize}},
+        ilp::Type{Val{ILP}},
+        ::Type{Epilogue}
+    ) where {T, AccumT, SMemSize, ILP, Epilogue}
+
+    ui1 = UInt32(1)
+    ILP32 = UInt32(ILP)
+    smem = @LocalMemory(state, AccumT, SMemSize)
+    # forward the offset to batch[blockidx_x(state)]:
+    # each block handles one sample in the mini-batch
+    global_offset = (blockidx_x(state) - ui1) * classes
+
+    # find the max over the classes of this sample
+    thread_max = ilpReduce(state, input, global_offset, classes, max, -typemax(AccumT), ilp)
+    max_k = blockReduce(state, smem, thread_max, max, -typemax(AccumT))
+    max_k_non_accum = T(max_k)
+
+    # reduce all values: sum exp(x - max) over the classes
+    threadExp = ilpReduce(
+        state, input, global_offset, classes,
+        (acc, v)-> acc + exp(v - max_k_non_accum), AccumT(0), ilp
+    )
+    sumAll = blockReduce(state, smem, threadExp, +, AccumT(0))
+
+    offset = threadidx_x(state) - ui1
+    last = classes % (ILP32 * blockdim_x(state))
+    epilogue = Epilogue(max_k_non_accum, sumAll)
+    # write out exp(x - max) / sum, ILP elements per thread and iteration
+    while offset < (classes - last)
+        tmp = arg_ntuple(ilp, global_offset + offset) do j, off
+            input[ui1 + off + UInt32(j - 1) * blockdim_x(state)]
+        end
+        for j in ui1:ILP32
+            idx = global_offset + offset + UInt32(j - 1) * blockdim_x(state)
+            output[ui1 + idx] = apply(epilogue, tmp[j])
+        end
+        offset += blockdim_x(state) * ILP32
+    end
+
+    while offset < classes
+        output[ui1 + global_offset + offset] = apply(epilogue, input[ui1 + global_offset + offset])
+        offset += blockdim_x(state)
+    end
+    return
+end
+
+
+function getblocksize(ILP::Integer, dim_size::Integer)
+    block_size = 1
+    max_block_size = min(dim_size ÷ ILP, 1024)
+    while block_size < max_block_size
+        block_size *= 2
+    end
+    # Launch at least a single warp - the kernel assumes that.
+    block_size = max(block_size, 32)
+    return (block_size,)
+end
+
+struct SoftMaxForwardEpilogue{T1, T2}
+    max_input::T1
+    sum::T2
+end
+apply(x::SoftMaxForwardEpilogue, input) = exp(input - x.max_input) / x.sum
+
+function HostSoftMaxForward(
+        input::AbstractArray{T}, output::AbstractArray{T},
+        outer_size::Int, dim_size::Int, inner_size::Int, dim::Int,
+        Epilogue = SoftMaxForwardEpilogue
+    ) where T
+    # This kernel spawns one block per element in the batch.
+    # XXX: it assumes that inner_size == 1
+    if inner_size == 1
+        AccumT = T # TODO calculate a proper accumulation type
+        ILP = 2
+        grid = (outer_size,)
+        block = getblocksize(ILP, dim_size)
+        SMemSize = block[1]
+        gpu_call(
+            softmax_forward, input,
+            (output, input, AccumT, UInt32(dim_size), Val{SMemSize}, Val{ILP}, Epilogue),
+            (grid, block)
+        )
+    else
+        # The spatial kernel runs in a 2D grid, where each application along the y dimension
+        # has a fixed outer_size and runs in parallel over inner_size. Dimension x is parallel
+        # over outer_size. Reductions over dim are done in a single-threaded manner.
+        # Not ported yet - the CUDA reference:
+        # uint32_t smem_size;
+        # dim3 grid, block;
+        # SpatialSoftMax_getLaunchSizes(
+        #     state, &cunn_SpatialSoftMaxForward,
+        #     outer_size, dim_size, inner_size,
+        #     grid, block, smem_size
+        # );
+        #
+        # cunn_SpatialSoftMaxForward
+        #     <<<grid, block, smem_size>>>(
+        #         output, input, outer_size, dim_size, inner_size
+        #     )
+    end
+end
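+
+# A plain CPU reference for the inner_size == 1 path - a minimal sketch for checking the
+# kernel output against. `cpu_softmax_forward` is only a helper for this file's scratch tests;
+# each column holds the `classes` contiguous values that one GPU block processes.
+function cpu_softmax_forward(input::AbstractMatrix)
+    out = similar(input)
+    for j in 1:size(input, 2)
+        col = view(input, :, j)
+        m = maximum(col)               # same max subtraction as the kernel, for stability
+        s = sum(x-> exp(x - m), col)
+        out[:, j] .= exp.(col .- m) ./ s
+    end
+    return out
+end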
+
+using CLArrays
+input = CLArray(rand(2, 3, 4, 5, 6))
+output = CLArray(rand(100))
+
+HostSoftMaxForward(
+    input, output,
+    30, 4, 6, 3,
+    SoftMaxForwardEpilogue
+)
+
+input2 = JLArray(Array(input))
+output2 = JLArray(zeros(100))
+HostSoftMaxForward(
+    input2, output2,
+    30, 4, 6, 3,
+    SoftMaxForwardEpilogue
+)
+Array(output2)
+Array(output)[1:2]
+
+# m = Transpiler.CLMethod((softmax_forward,
+#     (CLArrays.KernelState, CLArrays.DeviceArray{Float64,2,Transpiler.CLIntrinsics.GlobalPointer{Float64}},
+#     CLArrays.DeviceArray{Float64,2,Transpiler.CLIntrinsics.GlobalPointer{Float64}},
+#     Type{Float64}, UInt32, Type{Val{2}}, Type{Val{2}}, Type{SoftMaxForwardEpilogue})
+# ))
+# Sugar.getast!(m)
+#
+# expr = Sugar.sugared(m.signature..., code_typed)
+# println(expr)
+# expr = Sugar.code_typed(m.signature..., optimize = false)[1][1].code
+# typs = Sugar.expr_type.(m, expr[45].args[2].args[2:end])
+# typs
+# expr = Sugar.code_typed(arg_ntuple, (typs...), optimize = true)
+# Sugar.getast!(Transpiler.CLMethod(typs[1], (typs[2:end]...)))
+# arg_ntuple(Val{UInt})
+# Sugar.show_source(STDERR, m, expr)
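+
+# Rough end-to-end sketch of the implemented inner_size == 1 path, compared against the
+# CPU reference above. Sizes are made up (10 classes, 30 samples) and JLArray from
+# GPUArrays is assumed as a backend that can execute the kernel on the CPU.
+input3 = JLArray(rand(10, 30))
+output3 = JLArray(zeros(10, 30))
+HostSoftMaxForward(
+    input3, output3,
+    30, 10, 1, 1,
+    SoftMaxForwardEpilogue
+)
+Array(output3) ≈ cpu_softmax_forward(Array(input3))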