Skip to content

Commit

Permalink
faster pow, updates for fixed exp(::Float32) variants
Browse files Browse the repository at this point in the history
  • Loading branch information
chriselrod committed Sep 20, 2021
1 parent d914412 commit 47105fe
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 52 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
src/#*#
tests/#*#
*#
*~
*.mem
src/svmlwrap.jl
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "SLEEFPirates"
uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
authors = ["chriselrod <[email protected]>"]
version = "0.6.26"
version = "0.6.27"

[deps]
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
Expand Down
87 changes: 69 additions & 18 deletions src/misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,82 @@
Exponentiation operator, returns `x` raised to the power `y`.
"""
@inline function pow(x::V, y::V) where {V <: FloatType}
T = eltype(x)
yi = unsafe_trunc(fpinttype(T), y)
yisint = yi == y
yisodd = isodd(yi) & yisint
absx = abs(x)
logkx = logk(absx)

logkxy = dmul(logkx, y)
T = eltype(x)
yi = unsafe_trunc(fpinttype(T), y)
yisint = yi == y
yisodd = isodd(yi) & yisint
absx = abs(x)
logkx = logk(absx)

logkxy = dmul(logkx, y)

if (VERSION v"1.6") && (T === Float64)
result = exp_hilo(logkxy.hi, logkxy.lo, Val(ℯ), VectorizationBase.has_feature(Val(:x86_64_avx512f)))
else
result = expk(logkxy)
end

result = ifelse(isnan(result), V(Inf), result)
result = ifelse(x > 0, result, ifelse(~yisint, V(NaN), ifelse(yisodd, -result, result)))

result = ifelse(isnan(result), V(Inf), result)
result = ifelse(x > 0, result, ifelse(~yisint, V(NaN), ifelse(yisodd, -result, result)))
efx = flipsign(abs(x) - one(x), y)
# result = ifelse(y == V(Inf), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
result = ifelse(isinf(y), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
result = ifelse(isinf(x) | (x == 0), ifelse(yisodd, _sign(x), V(1.0)) * ifelse(ifelse(x == 0, -y, y) < 0, V(0.0), V(Inf)), result)
result = ifelse(isnan(x) | isnan(y), V(NaN), result)
result = ifelse((y == 0) | (x == 1), V(1.0), result)

efx = flipsign(abs(x) - one(x), y)
# result = ifelse(y == V(Inf), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
result = ifelse(isinf(y), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
result = ifelse(isinf(x) | (x == 0), ifelse(yisodd, _sign(x), V(1.0)) * ifelse(ifelse(x == 0, -y, y) < 0, V(0.0), V(Inf)), result)
result = ifelse(isnan(x) | isnan(y), V(NaN), result)
result = ifelse((y == 0) | (x == 1), V(1.0), result)

return result
return result

end
@inline pow_fast(x, y) = exp2(y*log2_fast(x))

const J_TABLE_BASE = Ref(Base.Math.J_TABLE)
@inline base_exp_table_pointer() = VectorizationBase.stridedpointer(Base.unsafe_convert(Ptr{UInt64}, pointer_from_objref(J_TABLE_BASE)), VectorizationBase.LayoutPointers.StrideIndex{1,(1,),1}((StaticInt{8}(),), (StaticInt{0}(),)))

@inline function exp_hilo(x::Union{Float64,AbstractSIMD{<:Any,Float64}}, xlo::Union{Float64,AbstractSIMD{<:Any,Float64}}, ::Val{B}, ::False) where {B}
N_float = muladd(x, VectorizationBase.LogBo256INV(Val{B}(), Float64), VectorizationBase.MAGIC_ROUND_CONST(Float64))
N = VectorizationBase.target_trunc(reinterpret(UInt64, N_float))
N_float = N_float - VectorizationBase.MAGIC_ROUND_CONST(Float64)
r = VectorizationBase.fast_fma(N_float, VectorizationBase.LogBo256U(Val{B}(), Float64), x, fma_fast())
r = VectorizationBase.fast_fma(N_float, VectorizationBase.LogBo256L(Val{B}(), Float64), r, fma_fast())
# @show (N & 0x000000ff) % Int
j = vload(base_exp_table_pointer(), (N & 0x000000ff,))
jU = reinterpret(Float64, Base.Math.JU_CONST | (j&Base.Math.JU_MASK))
jL = reinterpret(Float64, Base.Math.JL_CONST | (j >> 8))
k = N >>> 0x00000008
very_small = muladd(jU, VectorizationBase.expm1b_kernel(Val{B}(), r), jL)
small_part = muladd(jU, xlo, very_small) + jU
# small_part = reinterpret(UInt64, vfmadd(js, expm1b_kernel(Val{B}(), r), js))
# return reinterpret(Float64, small_part), r, k, N_float, js
twopk = (k % UInt64) << 0x0000000000000034
res = reinterpret(Float64, twopk + reinterpret(UInt64, small_part))
return res
end
@inline function exp_hilo(x::Union{Float64,AbstractSIMD{<:Any,Float64}}, xlo::Union{Float64,AbstractSIMD{<:Any,Float64}}, ::Val{B}, ::True) where {B}
N_float = muladd(x, VectorizationBase.LogBo256INV(Val{B}(), Float64), VectorizationBase.MAGIC_ROUND_CONST(Float64))
N = VectorizationBase.target_trunc(reinterpret(UInt64, N_float))
N_float = N_float - VectorizationBase.MAGIC_ROUND_CONST(Float64)
r = fma(N_float, VectorizationBase.LogBo256U(Val{B}(), Float64), x)
r = fma(N_float, VectorizationBase.LogBo256L(Val{B}(), Float64), r)
# @show (N & 0x000000ff) % Int
# @show N N & 0x000000ff
j = vload(base_exp_table_pointer(), (N & 0x000000ff,))
jU = reinterpret(Float64, Base.Math.JU_CONST | (j&Base.Math.JU_MASK))
jL = reinterpret(Float64, Base.Math.JL_CONST | (j >> 8))
# @show N & 0x000000ff j jU jL
# k = N >>> 0x00000008
# small_part = reinterpret(UInt64, vfmadd(js, expm1b_kernel(Val{B}(), r), js))
very_small = muladd(jU, VectorizationBase.expm1b_kernel(Val{B}(), r), jL)
small_part = muladd(jU, xlo, very_small) + jU
# small_part = vfmadd(js, expm1b_kernel(Val{B}(), r), js)
# return reinterpret(Float64, small_part), r, k, N_float, js
res = VectorizationBase.vscalef(small_part, 0.00390625*N_float)
# twopk = (k % UInt64) << 0x0000000000000034
# res = reinterpret(Float64, twopk + small_part)
return res
end


@inline function cbrt_kernel(x::FloatType64)
c6 = -0.640245898480692909870982
Expand Down
67 changes: 40 additions & 27 deletions src/trig.jl
Original file line number Diff line number Diff line change
Expand Up @@ -193,33 +193,46 @@ end
end

@inline function sin_fast(d::FloatType64)
T = eltype(d)
I = fpinttype(T)
t = d

qh = trunc(d * (T(M_1_PI) / (1 << 24)))
ql = round(d * T(M_1_PI) - qh * (1 << 24))

d = muladd(qh , -PI_A(T) * (1 << 24) , d)
d = muladd(ql , -PI_A(T) , d)
d = muladd(qh , -PI_B(T) * (1 << 24) , d)
d = muladd(ql , -PI_B(T) , d)
d = muladd(qh , -PI_C(T) * (1 << 24) , d)
d = muladd(ql , -PI_C(T) , d)
d = muladd(qh * (1 << 24) + ql, -PI_D(T), d)

s = d * d

qli = unsafe_trunc(fpinttype(T), ql)
d = ifelse(qli & one(I) != zero(I), -d, d)

u = sincos_fast_kernel(s)
u = muladd(s, u * d, d)

# u = ifelse((~isinf(t)) & (isnegzero(t) | (abs(t) > TRIG_MAX(T))), T(-0.0), u)
# u = ifelse((~isinf(t)) & ((abs(t) > TRIG_MAX(T))), T(-0.0), u)

return u
T = eltype(d)
I = fpinttype(T)
t = d

qh = trunc(d * (T(M_1_PI) / (1 << 24)))
ql = round(d * T(M_1_PI) - qh * (1 << 24))
# @show qh, ql
# qh = trunc(d * (T(M_1_PI) / (1 << 24)))
# ql = round(muladd(d, T(M_1_PI), - qh * (1 << 24)))
# @show qh, ql
# qhb = trunc(big(d) * (T(M_1_PI) / (1 << 24)))
# qlb = round(d * T(M_1_PI) - qhb * (1 << 24))
# # qh = Float64(qhb); ql = Float64(qlb)
# # qh = (qhb); ql = (qlb)
# @show qh, ql

d = muladd(qh , -PI_A(T) * (1 << 24) , d)
d = muladd(ql , -PI_A(T) , d)
d = muladd(qh , -PI_B(T) * (1 << 24) , d)
d = muladd(ql , -PI_B(T) , d)
d = muladd(qh , -PI_C(T) * (1 << 24) , d)
d = muladd(ql , -PI_C(T) , d)
d = muladd(muladd(qh, (1 << 24), ql), -PI_D(T), d)
# @show d, qh, ql

# Nd = round(d*inv(π))
# d = muladd(-Float64(π), Nd, d)
# d = ifelse(isodd(Base.unsafe_trunc(fpinttype(T), Nd)), -d, d)

s = d * d

qli = unsafe_trunc(fpinttype(T), ql)
d = ifelse(qli & one(I) != zero(I), -d, d)
u = sincos_fast_kernel(s)
u = muladd(s, u * d, d)

# u = ifelse((~isinf(t)) & (isnegzero(t) | (abs(t) > TRIG_MAX(T))), T(-0.0), u)
# u = ifelse((~isinf(t)) & ((abs(t) > TRIG_MAX(T))), T(-0.0), u)

return u
end

@inline function sin_fast(d::FloatType32)
Expand Down
7 changes: 4 additions & 3 deletions test/accuracy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
xx = nextfloat(SLEEFPirates.VectorizationBase.MIN_EXP(Val(ℯ),T)):T(0.02):prevfloat(SLEEFPirates.VectorizationBase.MAX_EXP(Val(ℯ),T));
# xx = T == Float32 ? map(T, vcat(-10:0.0002:10, -50:0.1:50)) : map(T, vcat(-10:0.0002:10, -705:0.1:705))
fun_table = Dict(SLEEFPirates.exp => Base.exp)
tol = 2
tol = 3
test_acc(T, fun_table, xx, tol)


Expand Down Expand Up @@ -132,7 +132,8 @@
xx3 = map(Tuple{T,T}, [(x,y) for x = 2.1, y = -1000:0.1:1000]);
txx = vcat(xx1, xx2, xx2);
fun_table = Dict(SLEEFPirates.pow => Base.:^);
tol = 1
# tol = 1
tol = 3
test_acc(T, fun_table, txx, tol)

xx1 = map(Tuple{T,T}, [(x,y) for x = 0:0.20:100, y = 0.1:0.20:100])[:];
Expand Down Expand Up @@ -168,7 +169,7 @@
xx = nextfloat(SLEEFPirates.VectorizationBase.MIN_EXP(Val(10),T)):T(0.02):prevfloat(SLEEFPirates.VectorizationBase.MAX_EXP(Val(10),T));
# xx = map(T, vcat(-10:0.0002:10, -35:0.023:1000, -300:0.01:300))
fun_table = Dict(SLEEFPirates.exp10 => Base.exp10)
tol = 2
tol = 3
test_acc(T, fun_table, xx, tol)


Expand Down
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using Aqua
include("testsetup.jl")

@show Sys.CPU_NAME
Expand Down
2 changes: 1 addition & 1 deletion test/testsetup.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

using SLEEFPirates, VectorizationBase, Aqua
using SLEEFPirates, VectorizationBase
using VectorizationBase: data
using Test

Expand Down

0 comments on commit 47105fe

Please sign in to comment.