Skip to content

Commit

Permalink
Less metaprogramming
Browse files Browse the repository at this point in the history
  • Loading branch information
chriselrod committed May 23, 2021
1 parent 014f7be commit 266ddee
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 118 deletions.
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
name = "SLEEFPirates"
uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
authors = ["chriselrod <[email protected]>"]
version = "0.6.19"
version = "0.6.20"

[deps]
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
IfElse = "0.1"
Static = "0.2"
VectorizationBase = "0.19.37, 0.20"
julia = "1.5"

Expand Down
7 changes: 3 additions & 4 deletions src/SLEEFPirates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@ module SLEEFPirates
using Base: llvmcall
using Base.Math: uinttype, exponent_bias, exponent_mask, significand_bits, IEEEFloat, exponent_raw_max

using Libdl, VectorizationBase
using VectorizationBase
using Static: True, False, One, lt, StaticInt

using VectorizationBase: vzero, AbstractSIMD, _Vec, fma_fast, data, VecUnroll, NativeTypes, FloatingTypes, vIEEEFloat,
vfmadd, vfnmadd, vfmsub, vfnmsub, True, False, One,
Double, dadd, dadd2, dsub, dsub2, dmul, dsqu, dsqrt, ddiv, drec, scale,
dnormalize
vfmadd, vfnmadd, vfmsub, vfnmsub, Double, dadd, dadd2, dsub, dsub2, dmul, dsqu, dsqrt, ddiv, drec, scale, dnormalize


import IfElse: ifelse
Expand Down
138 changes: 80 additions & 58 deletions src/estrin.jl
Original file line number Diff line number Diff line change
@@ -1,51 +1,51 @@

@generated function estrin(x, p::NTuple{N}) where {N}
# N = length(p)
# log2N = VectorizationBase.intlog2(N)
ex = Expr(:block, Expr(:meta, :inline))
Nfrac1 = N >> 1
nextp = :p_1_
nextx = :x_1
Nfrac1 > 1 && push!(ex.args, :( $nextx = x * x ))
for n 1:Nfrac1
push!(ex.args, :( $(Symbol(nextp,n)) = muladd(p[$(2n)],x,p[$(2n-1)]) ) )
end
oddNfrac = isodd(N)
if oddNfrac
Nfrac1 += 1
push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = p[$N] ))
end

lastp = nextp
lastx = nextx
depth = 1
while Nfrac1 > 1
# while Nfrac1 > 1
oddNfrac = isodd(Nfrac1)
Nfrac1 >>= 1
depth += 1
nextp = Symbol(:p_, depth, :_)
nextx = Symbol(:x_, depth)
(Nfrac1 > 1 || oddNfrac) && push!(ex.args, :( $nextx = $lastx * $lastx ))
for n 1:Nfrac1
np = Symbol(nextp,n)
# @show lastp, n
lpu = Symbol(lastp,2n)
lpl = Symbol(lastp,2n-1)
push!(ex.args, :( $np = muladd($lpu,$lastx,$lpl) ) )
end
if oddNfrac
Nfrac1 += 1
push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = $(Symbol(lastp,2Nfrac1-1)) ))
end
# @generated function estrin(x, p::NTuple{N}) where {N}
# # N = length(p)
# # log2N = VectorizationBase.intlog2(N)
# ex = Expr(:block, Expr(:meta, :inline))
# Nfrac1 = N >> 1
# nextp = :p_1_
# nextx = :x_1
# Nfrac1 > 1 && push!(ex.args, :( $nextx = Base.FastMath.mul_fast(x, x)))
# for n ∈ 1:Nfrac1
# push!(ex.args, :( $(Symbol(nextp,n)) = muladd(p[$(2n)],x,p[$(2n-1)]) ) )
# end
# oddNfrac = isodd(N)
# if oddNfrac
# Nfrac1 += 1
# push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = p[$N] ))
# end

lastp = nextp
lastx = nextx
end
ex
end
# lastp = nextp
# lastx = nextx
# depth = 1
# while Nfrac1 > 1
# # while Nfrac1 > 1
# oddNfrac = isodd(Nfrac1)
# Nfrac1 >>= 1
# depth += 1
# nextp = Symbol(:p_, depth, :_)
# nextx = Symbol(:x_, depth)
# (Nfrac1 > 1 || oddNfrac) && push!(ex.args, :( $nextx = Base.FastMath.mul_fast($lastx, $lastx)))
# for n ∈ 1:Nfrac1
# np = Symbol(nextp,n)
# # @show lastp, n
# lpu = Symbol(lastp,2n)
# lpl = Symbol(lastp,2n-1)
# push!(ex.args, :( $np = muladd($lpu,$lastx,$lpl) ) )
# end
# if oddNfrac
# Nfrac1 += 1
# push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = $(Symbol(lastp,2Nfrac1-1)) ))
# end

# lastp = nextp
# lastx = nextx
# end
# ex
# end
# Given a `VecUnroll` argument, we'll instruction level parallelism that way and can thus forgo `estrin`
@inline estrin(x::VecUnroll, p::NTuple{N}) where {N} = evalpoly(x, p)

# macro estrin(x, p...)
# t = Expr(:tuple); foreach(pᵢ -> push!(t.args, pᵢ), p)
Expand All @@ -56,22 +56,44 @@ end
# end
# end

macro horner(x, p...)
N = length(p)
ex = Expr(:call, :muladd, p[N], x, p[N-1])
for n 2:N-1
ex = Expr(:call, :muladd, ex, x, p[N-n])
end
esc(ex)
end
# macro horner(x, p...)
# N = length(p)
# ex = Expr(:call, :muladd, p[N], x, p[N-1])
# for n ∈ 2:N-1
# ex = Expr(:call, :muladd, ex, x, p[N-n])
# end
# esc(ex)
# end

@generated function evalpoly(x, p::Tuple{Vararg{Any,N}}) where {N}
ex = Expr(:call, :muladd, Expr(:ref, :p, N), :x, Expr(:ref, :p, N-1))
for n 2:N-1
ex = Expr(:call, :muladd, ex, :x, Expr(:ref, :p, N - n))
end
Expr(:block, Expr(:meta, :inline), ex)
@inline evalpoly(x, p::Tuple{T1}) where {T1} = only(p)
@inline evalpoly(x, p::Tuple{T1,T2}) where {T1,T2} = muladd(p[2], x, p[1])
@inline evalpoly(x, p::Tuple{T1,T2,T3,Vararg{Any,N}}) where {T1,T2,T3,N} = evalpoly(x, (ntuple(n -> p[n], Val(N+1))..., muladd(p[end], x, p[end-1])))

@inline estrin(x::VecUnroll, p::NTuple{N}) where {N} = evalpoly(x, p)
@inline estrin(x, p::Tuple{Vararg{Any,N}}) where {N} = estrin(x, p, StaticInt{N}() & StaticInt{3}(), lt(StaticInt{N}(), StaticInt(7)))
@inline estrin(x, p, r, ::True) = evalpoly(x, p)
@inline function estrin(x, p::Tuple{Vararg{Any,N}}, ::StaticInt{0}, ::False) where {N}
x2 = Base.FastMath.mul_fast(x, x)
x4 = Base.FastMath.mul_fast(x2, x2)
res = muladd(x2, muladd(x, p[end], p[end-1]), muladd(x, p[end-2], p[end-3]))
return _estrin(x, x2, x4, res, ntuple(n -> p[n], Val(N-4)))
end
@inline function estrin(x, p::Tuple{Vararg{Any,N}}, ::StaticInt{R}, ::False) where {N,R}
x2 = Base.FastMath.mul_fast(x, x)
x4 = Base.FastMath.mul_fast(x2, x2)
res = evalpoly(x, p[N-R+1:N])
return _estrin(x, x2, x4, res, ntuple(n -> p[n], Val(N-R)))
end
@inline function __estrin(x, x2, x4, ex, p1, p2, p3, p4)
part = muladd(x2, muladd(x, p4, p3), muladd(x, p2, p1))
muladd(x4, ex, part)
end
@inline _estrin(x, x2, x4, ex, p::Tuple{}) = ex
@inline function _estrin(x, x2, x4, ex, p::Tuple{T1,T2,T3,T4,Vararg{Any,N}}) where {T1,T2,T3,T4,N}
ex = _estrin(x, x2, x4, ex, ntuple(n -> p[n+4], Val(N)))
__estrin(x, x2, x4, ex, p[1], p[2], p[3], p[4])
end


# OscardSmith
# https://github.com/JuliaLang/julia/blob/3253fb5a60ad841965eb6bd218921d55101c0842/base/special/expm1.jl
Expand Down
47 changes: 22 additions & 25 deletions src/log.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,23 +94,21 @@ end


@inline function log_kernel(x::FloatType64)
c7 = 0.1532076988502701353
c6 = 0.1525629051003428716
c5 = 0.1818605932937785996
c4 = 0.2222214519839380009
c3 = 0.2857142932794299317
c2 = 0.3999999999635251990
c1 = 0.6666666666667333541
# return @horner x c1 c2 c3 c4 c5 c6 c7
@horner x c1 c2 c3 c4 c5 c6 c7
c7 = 0.1532076988502701353
c6 = 0.1525629051003428716
c5 = 0.1818605932937785996
c4 = 0.2222214519839380009
c3 = 0.2857142932794299317
c2 = 0.3999999999635251990
c1 = 0.6666666666667333541
evalpoly(x, (c1, c2, c3, c4, c5, c6, c7))
end

@inline function log_kernel(x::FloatType32)
c3 = 0.3027294874f0
c2 = 0.3996108174f0
c1 = 0.6666694880f0
# return @horner x c1 c2 c3
@horner x c1 c2 c3
c3 = 0.3027294874f0
c2 = 0.3996108174f0
c1 = 0.6666694880f0
evalpoly(x, (c1, c2, c3))
end

"""
Expand Down Expand Up @@ -162,8 +160,7 @@ end
# c3 = 0.399999999950799600689777
# c2 = 0.6666666666667778740063
# c1 = 2.0
# # return @horner x c1 c2 c3 c4 c5 c6 c7 c8
# @horner x c1 c2 c3 c4 c5 c6 c7 c8
# evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
# end
@inline function log_fast_kernel(::Val{ℯ}, x::FloatType64)
c1 = 1.999999999999999972288314133660764626502058106566467649677782135685758742482496766144698785351773083811113458922204637526163881038549205101773551870109094623210762917254877090109553039007279469903453824801461040391573804774824166566643949526292524754553263795026695443159712389662581644078252968607010603610968
Expand All @@ -175,8 +172,7 @@ end
c7 = 0.1524699883651617684758546885725609890936381489527976937281307452249553913565153843250171402249265467319884679549763243177823555571966975970148692848138108033225746264903112492670940278404127063267504949155893965147469747805989675713477911533708565646356998159446756108541482683919621746752450341943950918560444
c8 = 0.1538953296978747663739188355461749838905796367288249763220945925738162292410415558446630046977678608900931196974342765569666836704427957456516824211567717339130678264230162068823354062924440216282160834915006269368499669546218116339647782657742158043373280480925865755011661730676024105585113850596556788214285

# return @horner x c1 c2 c3 c4 c5 c6 c7 c8
@horner x c1 c2 c3 c4 c5 c6 c7 c8
evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
end
@inline function log_fast_kernel(::Val{2}, x::FloatType64)
c1 = 2.885390081777926774740337587963391752686470697574765996133470083444891120861519512296991622220723412089079177191231548172714708142318482235816576480680610952734785448442320447132165628681849517673133529587846690979418974880507488664900789156089097059610996054569326290084125678471502654166548446893122338532023
Expand All @@ -187,8 +183,8 @@ end
c6 = 0.2623764049675897374399611828075795820528035043402834678195862228934527298347670603110363990323094620293616051801837703306154765947894231616030376001347124009763901282915858969582585714993674594044171879858797430669924758656504158059304709147122269898951359911582928703927632984965613508778815160634440821006823
c7 = 0.2199676960988168325549661966995124554567674873746539020720831896109731813706126058088861721617649492103067104363793480846581954569700692779507905838002757047383852895766988734919935420040379293796738154168364864473763605523658805065039735690708281857109533843972417256743512773874125120066398926457479452994093
c8 = 0.2220240289710959406538132498293188569978091951805264791950528071212183891654240675907318516066736410222500677207442838638375330590597400913020365613995393179596880082388555123835749955456521729694538056168216831175849919088444866170319155410834510245750708062346271728171344176144233493167586972292956621704167
# return @horner x c1 c2 c3 c4 c5 c6 c7 c8
@horner x c1 c2 c3 c4 c5 c6 c7 c8

evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
end

@inline function log_fast_kernel(::Val{10}, x::FloatType64)
Expand All @@ -200,7 +196,8 @@ end
c6 = 0.07898316804972451278986704059544004657534078944771282690278469570981935772429141983258957479757541438522692317478446375357141436448959705067029890831066506672402437452874301364583400492390678050347427582721177656604136026871846918722378134086776956936760280411925008797490847304302379623068922461071133054013859
c7 = 0.06621687460284276437404220222283889587447460505118960613559060231236406623915686648073020335416144993466970222767859395725920508224420109118449188766790585839731181487203895006109028946170265819774012459305958251419929926567321073232894818683941128265790381145874410375046038091980215817264367754587024710978986
c8 = 0.0668358924784686462819357730165824743917341557202951962573347574755028338084519992412011915655131252999581138520558248266369770803727027591714823269708741953473958951570623295145882252663691038603224938441400683179259967167971897078905444705976854175284181780118155805554122865814034985253134445374766613739187
@horner x c1 c2 c3 c4 c5 c6 c7 c8

evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
end

@inline function log_fast_kernel(::Val{ℯ}, x::FloatType32)
Expand All @@ -209,24 +206,24 @@ end
c3 = 0.400005877017974853515625f0
c2 = 0.666666686534881591796875f0
c1 = 2f0
# return @horner x c1 c2 c3 c4 c5
@horner x c1 c2 c3 c4 c5

evalpoly(x, (c1, c2, c3, c4, c5))
end
@inline function log_fast_kernel(::Val{2}, x::FloatType32)
c1 = 2.88539008183608973931111465797605163855459400351399679091670150619994307807932305709230255880455898882523001788391899491983367652635183189227843545539636871821101647200186975131915853092890583610850196033734515715697697545252303451919374844119991942911339078566733293749421229056798365084549286067828816391335f0
c2 = 0.9617966217191049631079511471983452477766855280727822803767276536378636410044422070689420475579087786542928086817468309854736049531273903145228084274670593651217615221379183203388034928285734826591357927980847901606338440032331555302786596797592993793099171114989291012653957044107057443267095126328001276549215f0
c3 = 0.577092346895436124496418053994048368787454897109979847565300121654461075967303615264336356613865144091476468885553992850165141154633791757163879758533181288454326796486333639237352641170689759463369795606769823631738673348747856212036970422229498298065338996708007695849071708840167941742938607388858010144124f0
c4 = 0.4112062774884430872292144102146583634128259952701992109373810141836243377352557453727961104893802593899034166502781198307239642169574364520342372597806296006723519823066460247328403371685464127344456586539860933393102297655759579144420040193328399480261756827846184101221968710775674920677966868480884154531906f0
c5 = 0.3483929578463614140008807858571049228792841000218239434906676609692043222435549289484194004801943792995337270459610071866199644093805754623761881367467842177198732930226925274643266617665940920235915835781226498423330347120803047035104419586882902933922994002620011702649787056467119273317636032117141887959942f0
@horner x c1 c2 c3 c4 c5
evalpoly(x, (c1, c2, c3, c4, c5))
end
@inline function log_fast_kernel(::Val{10}, x::FloatType32)
c1 = 0.8685889638240124402397708951106502262202360135356954454698763954975124221040424849066825167876820875325127784493624787542409796165383940126454679616909814259580253262132903371754228716156380963198674619876749182361498397961445142358261199400985591909478187454632084248453424851342102079761821940780603198545616f0
c2 = 0.2895296328657339288904493329122032697541659756446158536870312153495414932533066234388735996180213425285285960320369659124149855341013308711222268203432308124686350895775277211606241842893767238421488611650069250184273261408780106508058139758507081550308729508346041832759467694040676864256279697131089631717099f0
c3 = 0.1737221066836498683203094026804327752209318893377756731259494918401339926199681736692287989621095193583635545978137901741612449570980653896864502278390604987366493020260231450208306389176678539877072993190077953368719548863200253103843475249767868540737686876012511040200964468493328048981851042321833775224372f0
c4 = 0.1237854239293478707125637927629425829204203853315914667487437505152964085560979536798787545674711702014151928256831535140556118705532892713919312267005970112932094938487205764203670341705365514082680822781583742792219266327012272008036123492184419119821962482310107701115807828778871264690149628141482055104599f0
c5 = 0.1048767305898517597797548767722330999824230472712341470034472461777359592830413739746852188570789846396677304208657150772765766584594750277172294014192856723183380665073137335352084472870754706334151185602395485903693315152058182538357857178779149581856966210549355606555462782599244992615178411271029676590258f0
@horner x c1 c2 c3 c4 c5
evalpoly(x, (c1, c2, c3, c4, c5))
end

"""
Expand Down
28 changes: 14 additions & 14 deletions src/misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,22 @@ end


@inline function cbrt_kernel(x::FloatType64)
c6d = -0.640245898480692909870982
c5d = 2.96155103020039511818595
c4d = -5.73353060922947843636166
c3d = 6.03990368989458747961407
c2d = -3.85841935510444988821632
c1d = 2.2307275302496609725722
@horner x c1d c2d c3d c4d c5d c6d
c6 = -0.640245898480692909870982
c5 = 2.96155103020039511818595
c4 = -5.73353060922947843636166
c3 = 6.03990368989458747961407
c2 = -3.85841935510444988821632
c1 = 2.2307275302496609725722
evalpoly(x, (c1, c2, c3, c4, c5, c6))
end
@inline function cbrt_kernel(x::FloatType32)
c6f = -0.601564466953277587890625f0
c5f = 2.8208892345428466796875f0
c4f = -5.532182216644287109375f0
c3f = 5.898262500762939453125f0
c2f = -3.8095417022705078125f0
c1f = 2.2241256237030029296875f0
@horner x c1f c2f c3f c4f c5f c6f
c6 = -0.601564466953277587890625f0
c5 = 2.8208892345428466796875f0
c4 = -5.532182216644287109375f0
c3 = 5.898262500762939453125f0
c2 = -3.8095417022705078125f0
c1 = 2.2241256237030029296875f0
evalpoly(x, (c1, c2, c3, c4, c5, c6))
end
"""
Algorithm:
Expand Down
14 changes: 7 additions & 7 deletions src/priv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -256,13 +256,13 @@ const under_expk(::Type{Float32}) = -104f0
return estrin(x, (c1, c2, c3, c4, c5, c6, c7, c8, c9, c10))
end

@inline function expk_kernel(x::FloatType32)
@inline function expk_kernel(x::FloatType32)
c5 = 0.00136324646882712841033936f0
c4 = 0.00836596917361021041870117f0
c3 = 0.0416710823774337768554688f0
c2 = 0.166665524244308471679688f0
c1 = 0.499999850988388061523438f0
return @horner x c1 c2 c3 c4 c5
return evalpoly(x, (c1, c2, c3, c4, c5))
end

@inline function expk(d::Double{V}) where {V <: vIEEEFloat}
Expand Down Expand Up @@ -303,13 +303,13 @@ end
return dadd(dmul(x, u), c1)
end

@inline function expk2_kernel(x::Double{<:FloatType32})
@inline function expk2_kernel(x::Double{<:FloatType32})
c5 = 0.1980960224f-3
c4 = 0.1394256484f-2
c3 = 0.8333456703f-2
c2 = 0.4166637361f-1
c1 = 0.166666659414234244790680580464f0
u = @horner x.hi c2 c3 c4 c5
u = evalpoly(x.hi, (c2, c3, c4, c5))
return dadd(dmul(x, u), c1)
end

Expand Down Expand Up @@ -343,15 +343,15 @@ end
c3 = 0.285714285511134091777308
c2 = 0.400000000000914013309483
c1 = 0.666666666666664853302393
return @horner x c1 c2 c3 c4 c5 c6 c7 c8
return evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
end

@inline function logk2_kernel(x::FloatType32)
c4 = 0.240320354700088500976562f0
c3 = 0.285112679004669189453125f0
c2 = 0.400007992982864379882812f0
c1 = 0.666666686534881591796875f0
return @horner x c1 c2 c3 c4
return evalpoly(x, (c1, c2, c3, c4))
end

@inline function logk2(d::Double{V}) where {V <: vIEEEFloat}
Expand Down Expand Up @@ -393,7 +393,7 @@ end
c3 = 0.285112679004669189453125f0
c2 = 0.400007992982864379882812f0
c1 = Double(0.66666662693023681640625f0, 3.69183861259614332084311f-9)
dadd(dmul(x, @horner x.hi c2 c3 c4), c1)
dadd(dmul(x, evalpoly(x.hi, (c2, c3, c4))), c1)
end

logkmul(::Type{Float64}) = 1.8446744073709551616e19
Expand Down
Loading

2 comments on commit 266ddee

@chriselrod
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/37343

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.6.20 -m "<description of version>" 266ddee58287f118731e020cba56960947ec09d7
git push origin v0.6.20

Please sign in to comment.