faster pow, updates for fixed exp(::Float32) variants

JuliaSIMD · Sep 20, 2021 · 47105fe · 47105fe
1 parent d914412
commit 47105fe
Show file tree

Hide file tree

Showing 7 changed files with 117 additions and 52 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,8 +4,7 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
-src/#*#
-tests/#*#
+*#
 *~
 *.mem
 src/svmlwrap.jl

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "SLEEFPirates"
 uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
 authors = ["chriselrod <[email protected]>"]
-version = "0.6.26"
+version = "0.6.27"
 
 [deps]
 IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"

diff --git a/src/misc.jl b/src/misc.jl
@@ -4,31 +4,82 @@
 Exponentiation operator, returns `x` raised to the power `y`.
 """
 @inline function pow(x::V, y::V) where {V <: FloatType}
-    T = eltype(x)
-    yi = unsafe_trunc(fpinttype(T), y)
-    yisint = yi == y
-    yisodd = isodd(yi) & yisint
-    absx = abs(x)
-    logkx = logk(absx)
-
-    logkxy = dmul(logkx, y)
+  T = eltype(x)
+  yi = unsafe_trunc(fpinttype(T), y)
+  yisint = yi == y
+  yisodd = isodd(yi) & yisint
+  absx = abs(x)
+  logkx = logk(absx)
+
+  logkxy = dmul(logkx, y)
+
+  if (VERSION ≥ v"1.6") && (T === Float64)
+    result = exp_hilo(logkxy.hi, logkxy.lo, Val(ℯ), VectorizationBase.has_feature(Val(:x86_64_avx512f)))
+  else
     result = expk(logkxy)
+  end
+
+  result = ifelse(isnan(result), V(Inf), result)
+  result = ifelse(x > 0, result, ifelse(~yisint, V(NaN), ifelse(yisodd, -result, result)))
 
-    result = ifelse(isnan(result), V(Inf), result)
-    result = ifelse(x > 0, result, ifelse(~yisint, V(NaN), ifelse(yisodd, -result, result)))
+  efx = flipsign(abs(x) - one(x), y)
+  # result = ifelse(y == V(Inf), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
+  result = ifelse(isinf(y), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
+  result = ifelse(isinf(x) | (x == 0), ifelse(yisodd, _sign(x), V(1.0)) * ifelse(ifelse(x == 0, -y, y) < 0, V(0.0), V(Inf)), result)
+  result = ifelse(isnan(x) | isnan(y), V(NaN), result)
+  result = ifelse((y == 0) | (x == 1), V(1.0), result)
 
-    efx = flipsign(abs(x) - one(x), y)
-    # result = ifelse(y == V(Inf), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
-    result = ifelse(isinf(y), ifelse(efx < 0, V(0.0), ifelse(efx == 0, V(1.0), V(Inf))), result)
-    result = ifelse(isinf(x) | (x == 0), ifelse(yisodd, _sign(x), V(1.0)) * ifelse(ifelse(x == 0, -y, y) < 0, V(0.0), V(Inf)), result)
-    result = ifelse(isnan(x) | isnan(y), V(NaN), result)
-    result = ifelse((y == 0) | (x == 1), V(1.0), result)
-
-    return result
+  return result
 
 end
 @inline pow_fast(x, y) = exp2(y*log2_fast(x))
 
+const J_TABLE_BASE = Ref(Base.Math.J_TABLE)
+@inline base_exp_table_pointer() = VectorizationBase.stridedpointer(Base.unsafe_convert(Ptr{UInt64}, pointer_from_objref(J_TABLE_BASE)), VectorizationBase.LayoutPointers.StrideIndex{1,(1,),1}((StaticInt{8}(),), (StaticInt{0}(),)))
+
+@inline function exp_hilo(x::Union{Float64,AbstractSIMD{<:Any,Float64}}, xlo::Union{Float64,AbstractSIMD{<:Any,Float64}}, ::Val{B}, ::False) where {B}
+  N_float = muladd(x, VectorizationBase.LogBo256INV(Val{B}(), Float64), VectorizationBase.MAGIC_ROUND_CONST(Float64))
+  N = VectorizationBase.target_trunc(reinterpret(UInt64, N_float))
+  N_float = N_float - VectorizationBase.MAGIC_ROUND_CONST(Float64)
+  r = VectorizationBase.fast_fma(N_float, VectorizationBase.LogBo256U(Val{B}(), Float64), x, fma_fast())
+  r = VectorizationBase.fast_fma(N_float, VectorizationBase.LogBo256L(Val{B}(), Float64), r, fma_fast())
+  # @show (N & 0x000000ff) % Int
+  j = vload(base_exp_table_pointer(), (N & 0x000000ff,))
+  jU = reinterpret(Float64, Base.Math.JU_CONST | (j&Base.Math.JU_MASK))
+  jL = reinterpret(Float64, Base.Math.JL_CONST | (j >> 8))
+  k = N >>> 0x00000008
+  very_small = muladd(jU, VectorizationBase.expm1b_kernel(Val{B}(), r), jL)
+  small_part = muladd(jU, xlo, very_small) + jU
+  # small_part = reinterpret(UInt64, vfmadd(js, expm1b_kernel(Val{B}(), r), js))
+  # return reinterpret(Float64, small_part), r, k, N_float, js
+  twopk = (k % UInt64) << 0x0000000000000034
+  res = reinterpret(Float64, twopk + reinterpret(UInt64, small_part))
+  return res
+end
+@inline function exp_hilo(x::Union{Float64,AbstractSIMD{<:Any,Float64}}, xlo::Union{Float64,AbstractSIMD{<:Any,Float64}}, ::Val{B}, ::True) where {B}
+  N_float = muladd(x, VectorizationBase.LogBo256INV(Val{B}(), Float64), VectorizationBase.MAGIC_ROUND_CONST(Float64))
+  N = VectorizationBase.target_trunc(reinterpret(UInt64, N_float))
+  N_float = N_float - VectorizationBase.MAGIC_ROUND_CONST(Float64)
+  r = fma(N_float, VectorizationBase.LogBo256U(Val{B}(), Float64), x)
+  r = fma(N_float, VectorizationBase.LogBo256L(Val{B}(), Float64), r)
+  # @show (N & 0x000000ff) % Int
+  # @show N N & 0x000000ff
+  j = vload(base_exp_table_pointer(), (N & 0x000000ff,))
+  jU = reinterpret(Float64, Base.Math.JU_CONST | (j&Base.Math.JU_MASK))
+  jL = reinterpret(Float64, Base.Math.JL_CONST | (j >> 8))
+  # @show N & 0x000000ff j jU jL
+  # k = N >>> 0x00000008
+  # small_part = reinterpret(UInt64, vfmadd(js, expm1b_kernel(Val{B}(), r), js))
+  very_small = muladd(jU, VectorizationBase.expm1b_kernel(Val{B}(), r), jL)
+  small_part = muladd(jU, xlo, very_small) + jU
+  # small_part = vfmadd(js, expm1b_kernel(Val{B}(), r), js)
+  # return reinterpret(Float64, small_part), r, k, N_float, js
+  res = VectorizationBase.vscalef(small_part, 0.00390625*N_float)
+  # twopk = (k % UInt64) << 0x0000000000000034
+  # res = reinterpret(Float64, twopk + small_part)
+  return res
+end
+
 
 @inline function cbrt_kernel(x::FloatType64)
   c6 = -0.640245898480692909870982

diff --git a/src/trig.jl b/src/trig.jl
@@ -193,33 +193,46 @@ end
 end
 
 @inline function sin_fast(d::FloatType64)
-    T = eltype(d)
-    I = fpinttype(T)
-    t = d
-
-    qh = trunc(d * (T(M_1_PI) / (1 << 24)))
-    ql = round(d * T(M_1_PI) - qh * (1 << 24))
-
-    d = muladd(qh , -PI_A(T) * (1 << 24) , d)
-    d = muladd(ql , -PI_A(T)             , d)
-    d = muladd(qh , -PI_B(T) * (1 << 24) , d)
-    d = muladd(ql , -PI_B(T)             , d)
-    d = muladd(qh , -PI_C(T) * (1 << 24) , d)
-    d = muladd(ql , -PI_C(T)             , d)
-    d = muladd(qh * (1 << 24) + ql, -PI_D(T), d)
-
-    s = d * d
-
-    qli = unsafe_trunc(fpinttype(T), ql)
-    d = ifelse(qli & one(I) != zero(I), -d, d)
-
-    u = sincos_fast_kernel(s)
-    u = muladd(s, u * d, d)
-
-    # u = ifelse((~isinf(t)) & (isnegzero(t) | (abs(t) > TRIG_MAX(T))), T(-0.0), u)
-    # u = ifelse((~isinf(t)) & ((abs(t) > TRIG_MAX(T))), T(-0.0), u)
-
-    return u
+  T = eltype(d)
+  I = fpinttype(T)
+  t = d
+
+  qh = trunc(d * (T(M_1_PI) / (1 << 24)))
+  ql = round(d * T(M_1_PI) - qh * (1 << 24))
+  # @show qh, ql
+  # qh = trunc(d * (T(M_1_PI) / (1 << 24)))
+  # ql = round(muladd(d, T(M_1_PI), - qh * (1 << 24)))
+  # @show qh, ql
+  # qhb = trunc(big(d) * (T(M_1_PI) / (1 << 24)))
+  # qlb = round(d * T(M_1_PI) - qhb * (1 << 24))
+  # # qh = Float64(qhb); ql = Float64(qlb)
+  # # qh = (qhb); ql = (qlb)
+  # @show qh, ql
+
+  d = muladd(qh , -PI_A(T) * (1 << 24) , d)
+  d = muladd(ql , -PI_A(T)             , d)
+  d = muladd(qh , -PI_B(T) * (1 << 24) , d)
+  d = muladd(ql , -PI_B(T)             , d)
+  d = muladd(qh , -PI_C(T) * (1 << 24) , d)
+  d = muladd(ql , -PI_C(T)             , d)
+  d = muladd(muladd(qh, (1 << 24), ql), -PI_D(T), d)
+  # @show d, qh, ql
+
+  # Nd = round(d*inv(π))
+  # d = muladd(-Float64(π), Nd, d)
+  # d = ifelse(isodd(Base.unsafe_trunc(fpinttype(T), Nd)), -d, d)
+
+  s = d * d
+
+  qli = unsafe_trunc(fpinttype(T), ql)
+  d = ifelse(qli & one(I) != zero(I), -d, d)
+  u = sincos_fast_kernel(s)
+  u = muladd(s, u * d, d)
+
+  # u = ifelse((~isinf(t)) & (isnegzero(t) | (abs(t) > TRIG_MAX(T))), T(-0.0), u)
+  # u = ifelse((~isinf(t)) & ((abs(t) > TRIG_MAX(T))), T(-0.0), u)
+
+  return u
 end
 
 @inline function sin_fast(d::FloatType32)

diff --git a/test/accuracy.jl b/test/accuracy.jl
@@ -7,7 +7,7 @@
     xx = nextfloat(SLEEFPirates.VectorizationBase.MIN_EXP(Val(ℯ),T)):T(0.02):prevfloat(SLEEFPirates.VectorizationBase.MAX_EXP(Val(ℯ),T));
     # xx = T == Float32 ? map(T, vcat(-10:0.0002:10, -50:0.1:50)) : map(T, vcat(-10:0.0002:10, -705:0.1:705))
     fun_table = Dict(SLEEFPirates.exp => Base.exp)
-    tol = 2
+    tol = 3
     test_acc(T, fun_table, xx, tol)
 
 
@@ -132,7 +132,8 @@
     xx3 = map(Tuple{T,T}, [(x,y) for x = 2.1, y = -1000:0.1:1000]);
     txx = vcat(xx1, xx2, xx2);
     fun_table = Dict(SLEEFPirates.pow => Base.:^);
-    tol = 1
+    # tol = 1
+    tol = 3
     test_acc(T, fun_table, txx, tol)
 
     xx1 = map(Tuple{T,T}, [(x,y) for x = 0:0.20:100, y = 0.1:0.20:100])[:];
@@ -168,7 +169,7 @@
     xx = nextfloat(SLEEFPirates.VectorizationBase.MIN_EXP(Val(10),T)):T(0.02):prevfloat(SLEEFPirates.VectorizationBase.MAX_EXP(Val(10),T));
     # xx = map(T, vcat(-10:0.0002:10, -35:0.023:1000, -300:0.01:300))
     fun_table = Dict(SLEEFPirates.exp10 => Base.exp10)
-    tol = 2
+    tol = 3
     test_acc(T, fun_table, xx, tol)
 
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,3 +1,4 @@
+using Aqua
 include("testsetup.jl")
 
 @show Sys.CPU_NAME

diff --git a/test/testsetup.jl b/test/testsetup.jl
@@ -1,5 +1,5 @@
 
-using SLEEFPirates, VectorizationBase, Aqua
+using SLEEFPirates, VectorizationBase
 using VectorizationBase: data
 using Test