From 266ddee58287f118731e020cba56960947ec09d7 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Sun, 23 May 2021 02:42:02 -0400
Subject: [PATCH] Less metaprogramming

---
 Project.toml        |   5 +-
 src/SLEEFPirates.jl |   7 +--
 src/estrin.jl       | 138 +++++++++++++++++++++++++-------------------
 src/log.jl          |  47 +++++++--------
 src/misc.jl         |  28 ++++-----
 src/priv.jl         |  14 ++---
 src/trig.jl         |  16 ++---
 7 files changed, 137 insertions(+), 118 deletions(-)

diff --git a/Project.toml b/Project.toml
index 0293fa1..5ce71d9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,15 +1,16 @@
 name = "SLEEFPirates"
 uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
 authors = ["chriselrod <elrodc@gmail.com>"]
-version = "0.6.19"
+version = "0.6.20"
 
 [deps]
 IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
-Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 IfElse = "0.1"
+Static = "0.2"
 VectorizationBase = "0.19.37, 0.20"
 julia = "1.5"
 
diff --git a/src/SLEEFPirates.jl b/src/SLEEFPirates.jl
index ef85105..81cc033 100644
--- a/src/SLEEFPirates.jl
+++ b/src/SLEEFPirates.jl
@@ -3,12 +3,11 @@ module SLEEFPirates
 using Base: llvmcall
 using Base.Math: uinttype, exponent_bias, exponent_mask, significand_bits, IEEEFloat, exponent_raw_max
 
-using Libdl, VectorizationBase
+using VectorizationBase
+using Static: True, False, One, lt, StaticInt
 
 using VectorizationBase: vzero, AbstractSIMD, _Vec, fma_fast, data, VecUnroll, NativeTypes, FloatingTypes, vIEEEFloat,
-    vfmadd, vfnmadd, vfmsub, vfnmsub, True, False, One,
-    Double, dadd, dadd2, dsub, dsub2, dmul, dsqu, dsqrt, ddiv, drec, scale,
-    dnormalize
+    vfmadd, vfnmadd, vfmsub, vfnmsub, Double, dadd, dadd2, dsub, dsub2, dmul, dsqu, dsqrt, ddiv, drec, scale, dnormalize
 
 
 import IfElse: ifelse
diff --git a/src/estrin.jl b/src/estrin.jl
index 76c83b3..bb3f771 100644
--- a/src/estrin.jl
+++ b/src/estrin.jl
@@ -1,51 +1,51 @@
 
-@generated function estrin(x, p::NTuple{N}) where {N}
-    # N = length(p)
-    # log2N = VectorizationBase.intlog2(N)
-    ex = Expr(:block, Expr(:meta, :inline))
-    Nfrac1 = N >> 1
-    nextp = :p_1_
-    nextx = :x_1
-    Nfrac1 > 1 && push!(ex.args, :( $nextx = x * x ))
-    for n ∈ 1:Nfrac1
-        push!(ex.args, :( $(Symbol(nextp,n)) = muladd(p[$(2n)],x,p[$(2n-1)]) ) )
-    end
-    oddNfrac = isodd(N)
-    if oddNfrac
-        Nfrac1 += 1
-        push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = p[$N]  ))
-    end
 
-    lastp = nextp
-    lastx = nextx
-    depth = 1
-    while Nfrac1 > 1
-    # while Nfrac1 > 1
-        oddNfrac = isodd(Nfrac1)
-        Nfrac1 >>= 1
-        depth += 1
-        nextp = Symbol(:p_, depth, :_)
-        nextx = Symbol(:x_, depth)
-        (Nfrac1 > 1 || oddNfrac) && push!(ex.args, :( $nextx = $lastx * $lastx ))
-        for n ∈ 1:Nfrac1
-            np = Symbol(nextp,n)
-            # @show lastp, n
-            lpu = Symbol(lastp,2n)
-            lpl = Symbol(lastp,2n-1)
-            push!(ex.args, :( $np = muladd($lpu,$lastx,$lpl) ) )
-        end
-        if oddNfrac
-            Nfrac1 += 1
-            push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = $(Symbol(lastp,2Nfrac1-1)) ))
-        end
+# @generated function estrin(x, p::NTuple{N}) where {N}
+#     # N = length(p)
+#     # log2N = VectorizationBase.intlog2(N)
+#     ex = Expr(:block, Expr(:meta, :inline))
+#     Nfrac1 = N >> 1
+#     nextp = :p_1_
+#     nextx = :x_1
+#     Nfrac1 > 1 && push!(ex.args, :( $nextx = Base.FastMath.mul_fast(x, x)))
+#     for n ∈ 1:Nfrac1
+#         push!(ex.args, :( $(Symbol(nextp,n)) = muladd(p[$(2n)],x,p[$(2n-1)]) ) )
+#     end
+#     oddNfrac = isodd(N)
+#     if oddNfrac
+#         Nfrac1 += 1
+#         push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = p[$N]  ))
+#     end
 
-        lastp = nextp
-        lastx = nextx
-    end
-    ex
-end
+#     lastp = nextp
+#     lastx = nextx
+#     depth = 1
+#     while Nfrac1 > 1
+#     # while Nfrac1 > 1
+#         oddNfrac = isodd(Nfrac1)
+#         Nfrac1 >>= 1
+#         depth += 1
+#         nextp = Symbol(:p_, depth, :_)
+#         nextx = Symbol(:x_, depth)
+#         (Nfrac1 > 1 || oddNfrac) && push!(ex.args, :( $nextx = Base.FastMath.mul_fast($lastx, $lastx)))
+#         for n ∈ 1:Nfrac1
+#             np = Symbol(nextp,n)
+#             # @show lastp, n
+#             lpu = Symbol(lastp,2n)
+#             lpl = Symbol(lastp,2n-1)
+#             push!(ex.args, :( $np = muladd($lpu,$lastx,$lpl) ) )
+#         end
+#         if oddNfrac
+#             Nfrac1 += 1
+#             push!(ex.args, :( $(Symbol(nextp,Nfrac1)) = $(Symbol(lastp,2Nfrac1-1)) ))
+#         end
+
+#         lastp = nextp
+#         lastx = nextx
+#     end
+#     ex
+# end
 # Given a `VecUnroll` argument, we'll instruction level parallelism that way and can thus forgo `estrin` 
-@inline estrin(x::VecUnroll, p::NTuple{N}) where {N} = evalpoly(x, p)
 
 # macro estrin(x, p...)
 #     t = Expr(:tuple); foreach(pᵢ -> push!(t.args, pᵢ), p)
@@ -56,22 +56,44 @@ end
 #     end
 # end
 
-macro horner(x, p...)
-    N = length(p)
-    ex = Expr(:call, :muladd, p[N], x, p[N-1])
-    for n ∈ 2:N-1
-        ex = Expr(:call, :muladd, ex, x, p[N-n])
-    end
-    esc(ex)
-end
+# macro horner(x, p...)
+#     N = length(p)
+#     ex = Expr(:call, :muladd, p[N], x, p[N-1])
+#     for n ∈ 2:N-1
+#         ex = Expr(:call, :muladd, ex, x, p[N-n])
+#     end
+#     esc(ex)
+# end
 
-@generated function evalpoly(x, p::Tuple{Vararg{Any,N}}) where {N}
-    ex = Expr(:call, :muladd, Expr(:ref, :p, N), :x, Expr(:ref, :p, N-1))
-    for n ∈ 2:N-1
-        ex = Expr(:call, :muladd, ex, :x, Expr(:ref, :p, N - n))
-    end
-    Expr(:block, Expr(:meta, :inline), ex)
+@inline evalpoly(x, p::Tuple{T1}) where {T1} = p[1]  # constant polynomial: nothing to multiply
+@inline evalpoly(x, p::Tuple{T1,T2}) where {T1,T2} = muladd(p[2], x, p[1])  # linear: p[1] + p[2]*x
+@inline evalpoly(x, p::Tuple{T1,T2,T3,Vararg{Any,N}}) where {T1,T2,T3,N} = muladd(evalpoly(x, Base.tail(p)), x, p[1])  # Horner step: p[1] + x * (polynomial in the remaining coefficients)
+
+@inline estrin(x::VecUnroll, p::NTuple{N}) where {N} = evalpoly(x, p)  # VecUnroll input: use plain Horner instead of Estrin
+@inline estrin(x, p::Tuple{Vararg{Any,N}}) where {N} = estrin(x, p, StaticInt{N}() & StaticInt{3}(), lt(StaticInt{N}(), StaticInt{7}()))  # dispatch on N & 3 and on whether N < 7
+@inline estrin(x, p, ::Any, ::True) = evalpoly(x, p)  # fewer than 7 coefficients: Horner
+@inline function estrin(x, p::Tuple{Vararg{Any,N}}, ::StaticInt{0}, ::False) where {N}
+  xsq = Base.FastMath.mul_fast(x, x)
+  xqd = Base.FastMath.mul_fast(xsq, xsq)
+  top = muladd(xsq, muladd(x, p[N], p[N-1]), muladd(x, p[N-2], p[N-3]))  # cubic in x from the four highest coefficients
+  return _estrin(x, xsq, xqd, top, ntuple(i -> p[i], Val(N-4)))  # hand the lower N-4 coefficients to _estrin
+end
+@inline function estrin(x, p::Tuple{Vararg{Any,N}}, ::StaticInt{R}, ::False) where {N,R}
+  x2 = Base.FastMath.mul_fast(x, x)
+  x4 = Base.FastMath.mul_fast(x2, x2)
+  res = evalpoly(x, ntuple(n -> p[N-R+n], Val(R)))  # Horner over the top R coefficients; Val(R) keeps the tuple length static (range indexing does not)
+  return _estrin(x, x2, x4, res, ntuple(n -> p[n], Val(N-R)))  # hand the lower N-R coefficients to _estrin
+end
+@inline function __estrin(x, x2, x4, ex, p1, p2, p3, p4)
+    lo, hi = muladd(x, p2, p1), muladd(x, p4, p3)  # p1 + p2*x and p3 + p4*x
+    return muladd(x4, ex, muladd(x2, hi, lo))  # ex*x4 + (hi*x2 + lo)
 end
+@inline _estrin(x, x2, x4, ex, p::Tuple{}) = ex  # no coefficients left: return the accumulated value
+@inline function _estrin(x, x2, x4, ex, p::Tuple{T1,T2,T3,T4,Vararg{Any,N}}) where {T1,T2,T3,T4,N}
+  rest = ntuple(i -> p[i+4], Val(N))  # everything after the first four coefficients
+  return __estrin(x, x2, x4, _estrin(x, x2, x4, ex, rest), p[1], p[2], p[3], p[4])  # recurse on the rest first, then combine with p[1:4]
+end
+
 
 # OscardSmith
 # https://github.com/JuliaLang/julia/blob/3253fb5a60ad841965eb6bd218921d55101c0842/base/special/expm1.jl
diff --git a/src/log.jl b/src/log.jl
index 6047266..b0958d9 100644
--- a/src/log.jl
+++ b/src/log.jl
@@ -94,23 +94,21 @@ end
 
 
 @inline function log_kernel(x::FloatType64)
-    c7 = 0.1532076988502701353
-    c6 = 0.1525629051003428716
-    c5 = 0.1818605932937785996
-    c4 = 0.2222214519839380009
-    c3 = 0.2857142932794299317
-    c2 = 0.3999999999635251990
-    c1 = 0.6666666666667333541
-    # return @horner x c1 c2 c3 c4 c5 c6 c7
-    @horner x c1 c2 c3 c4 c5 c6 c7
+  c7 = 0.1532076988502701353
+  c6 = 0.1525629051003428716
+  c5 = 0.1818605932937785996
+  c4 = 0.2222214519839380009
+  c3 = 0.2857142932794299317
+  c2 = 0.3999999999635251990
+  c1 = 0.6666666666667333541
+  evalpoly(x, (c1, c2, c3, c4, c5, c6, c7))  # Horner form: c1 + x*(c2 + ... + x*c7), lowest-order coefficient first
 end
 
 @inline function log_kernel(x::FloatType32)
-    c3 = 0.3027294874f0
-    c2 = 0.3996108174f0
-    c1 = 0.6666694880f0
-    # return @horner x c1 c2 c3
-    @horner x c1 c2 c3
+  c3 = 0.3027294874f0
+  c2 = 0.3996108174f0
+  c1 = 0.6666694880f0
+  evalpoly(x, (c1, c2, c3))  # Horner form: c1 + x*(c2 + x*c3)
 end
 
 """
@@ -162,8 +160,7 @@ end
 #     c3 = 0.399999999950799600689777
 #     c2 = 0.6666666666667778740063
 #     c1 = 2.0
-#     # return @horner x c1 c2 c3 c4 c5 c6 c7 c8
-#     @horner x c1 c2 c3 c4 c5 c6 c7 c8
+#     evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
 # end
 @inline function log_fast_kernel(::Val{ℯ}, x::FloatType64)
     c1 = 1.999999999999999972288314133660764626502058106566467649677782135685758742482496766144698785351773083811113458922204637526163881038549205101773551870109094623210762917254877090109553039007279469903453824801461040391573804774824166566643949526292524754553263795026695443159712389662581644078252968607010603610968
@@ -175,8 +172,7 @@ end
     c7 = 0.1524699883651617684758546885725609890936381489527976937281307452249553913565153843250171402249265467319884679549763243177823555571966975970148692848138108033225746264903112492670940278404127063267504949155893965147469747805989675713477911533708565646356998159446756108541482683919621746752450341943950918560444
     c8 = 0.1538953296978747663739188355461749838905796367288249763220945925738162292410415558446630046977678608900931196974342765569666836704427957456516824211567717339130678264230162068823354062924440216282160834915006269368499669546218116339647782657742158043373280480925865755011661730676024105585113850596556788214285
     
-    # return @horner x c1 c2 c3 c4 c5 c6 c7 c8
-    @horner x c1 c2 c3 c4 c5 c6 c7 c8
+  evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
 end
 @inline function log_fast_kernel(::Val{2}, x::FloatType64)
     c1 = 2.885390081777926774740337587963391752686470697574765996133470083444891120861519512296991622220723412089079177191231548172714708142318482235816576480680610952734785448442320447132165628681849517673133529587846690979418974880507488664900789156089097059610996054569326290084125678471502654166548446893122338532023
@@ -187,8 +183,8 @@ end
     c6 = 0.2623764049675897374399611828075795820528035043402834678195862228934527298347670603110363990323094620293616051801837703306154765947894231616030376001347124009763901282915858969582585714993674594044171879858797430669924758656504158059304709147122269898951359911582928703927632984965613508778815160634440821006823
     c7 = 0.2199676960988168325549661966995124554567674873746539020720831896109731813706126058088861721617649492103067104363793480846581954569700692779507905838002757047383852895766988734919935420040379293796738154168364864473763605523658805065039735690708281857109533843972417256743512773874125120066398926457479452994093
     c8 = 0.2220240289710959406538132498293188569978091951805264791950528071212183891654240675907318516066736410222500677207442838638375330590597400913020365613995393179596880082388555123835749955456521729694538056168216831175849919088444866170319155410834510245750708062346271728171344176144233493167586972292956621704167    
-    # return @horner x c1 c2 c3 c4 c5 c6 c7 c8
-    @horner x c1 c2 c3 c4 c5 c6 c7 c8
+
+  evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
 end
 
 @inline function log_fast_kernel(::Val{10}, x::FloatType64)
@@ -200,7 +196,8 @@ end
     c6 = 0.07898316804972451278986704059544004657534078944771282690278469570981935772429141983258957479757541438522692317478446375357141436448959705067029890831066506672402437452874301364583400492390678050347427582721177656604136026871846918722378134086776956936760280411925008797490847304302379623068922461071133054013859
     c7 = 0.06621687460284276437404220222283889587447460505118960613559060231236406623915686648073020335416144993466970222767859395725920508224420109118449188766790585839731181487203895006109028946170265819774012459305958251419929926567321073232894818683941128265790381145874410375046038091980215817264367754587024710978986
     c8 = 0.0668358924784686462819357730165824743917341557202951962573347574755028338084519992412011915655131252999581138520558248266369770803727027591714823269708741953473958951570623295145882252663691038603224938441400683179259967167971897078905444705976854175284181780118155805554122865814034985253134445374766613739187
-    @horner x c1 c2 c3 c4 c5 c6 c7 c8
+
+  evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
 end
 
 @inline function log_fast_kernel(::Val{ℯ}, x::FloatType32)
@@ -209,8 +206,8 @@ end
     c3 = 0.400005877017974853515625f0
     c2 = 0.666666686534881591796875f0
     c1 = 2f0
-    # return @horner x c1 c2 c3 c4 c5
-    @horner x c1 c2 c3 c4 c5
+
+  evalpoly(x, (c1, c2, c3, c4, c5))
 end
 @inline function log_fast_kernel(::Val{2}, x::FloatType32)
     c1 = 2.88539008183608973931111465797605163855459400351399679091670150619994307807932305709230255880455898882523001788391899491983367652635183189227843545539636871821101647200186975131915853092890583610850196033734515715697697545252303451919374844119991942911339078566733293749421229056798365084549286067828816391335f0
@@ -218,7 +215,7 @@ end
     c3 = 0.577092346895436124496418053994048368787454897109979847565300121654461075967303615264336356613865144091476468885553992850165141154633791757163879758533181288454326796486333639237352641170689759463369795606769823631738673348747856212036970422229498298065338996708007695849071708840167941742938607388858010144124f0
     c4 = 0.4112062774884430872292144102146583634128259952701992109373810141836243377352557453727961104893802593899034166502781198307239642169574364520342372597806296006723519823066460247328403371685464127344456586539860933393102297655759579144420040193328399480261756827846184101221968710775674920677966868480884154531906f0
     c5 = 0.3483929578463614140008807858571049228792841000218239434906676609692043222435549289484194004801943792995337270459610071866199644093805754623761881367467842177198732930226925274643266617665940920235915835781226498423330347120803047035104419586882902933922994002620011702649787056467119273317636032117141887959942f0
-    @horner x c1 c2 c3 c4 c5
+  evalpoly(x, (c1, c2, c3, c4, c5))
 end
 @inline function log_fast_kernel(::Val{10}, x::FloatType32)
     c1 =  0.8685889638240124402397708951106502262202360135356954454698763954975124221040424849066825167876820875325127784493624787542409796165383940126454679616909814259580253262132903371754228716156380963198674619876749182361498397961445142358261199400985591909478187454632084248453424851342102079761821940780603198545616f0
@@ -226,7 +223,7 @@ end
     c3 = 0.1737221066836498683203094026804327752209318893377756731259494918401339926199681736692287989621095193583635545978137901741612449570980653896864502278390604987366493020260231450208306389176678539877072993190077953368719548863200253103843475249767868540737686876012511040200964468493328048981851042321833775224372f0
     c4 = 0.1237854239293478707125637927629425829204203853315914667487437505152964085560979536798787545674711702014151928256831535140556118705532892713919312267005970112932094938487205764203670341705365514082680822781583742792219266327012272008036123492184419119821962482310107701115807828778871264690149628141482055104599f0
     c5 = 0.1048767305898517597797548767722330999824230472712341470034472461777359592830413739746852188570789846396677304208657150772765766584594750277172294014192856723183380665073137335352084472870754706334151185602395485903693315152058182538357857178779149581856966210549355606555462782599244992615178411271029676590258f0
-    @horner x c1 c2 c3 c4 c5
+  evalpoly(x, (c1, c2, c3, c4, c5))
 end
 
 """
diff --git a/src/misc.jl b/src/misc.jl
index f644bb4..1d7fff1 100644
--- a/src/misc.jl
+++ b/src/misc.jl
@@ -31,22 +31,22 @@ end
 
 
 @inline function cbrt_kernel(x::FloatType64)
-    c6d = -0.640245898480692909870982
-    c5d = 2.96155103020039511818595
-    c4d = -5.73353060922947843636166
-    c3d = 6.03990368989458747961407
-    c2d = -3.85841935510444988821632
-    c1d = 2.2307275302496609725722
-    @horner x c1d c2d c3d c4d c5d c6d
+  c6 = -0.640245898480692909870982
+  c5 = 2.96155103020039511818595
+  c4 = -5.73353060922947843636166
+  c3 = 6.03990368989458747961407
+  c2 = -3.85841935510444988821632
+  c1 = 2.2307275302496609725722
+  evalpoly(x, (c1, c2, c3, c4, c5, c6))  # Horner form: c1 + x*(c2 + ... + x*c6), lowest-order coefficient first
 end
 @inline function cbrt_kernel(x::FloatType32)
-    c6f = -0.601564466953277587890625f0
-    c5f =  2.8208892345428466796875f0
-    c4f = -5.532182216644287109375f0
-    c3f =  5.898262500762939453125f0
-    c2f = -3.8095417022705078125f0
-    c1f =  2.2241256237030029296875f0
-    @horner x c1f c2f c3f c4f c5f c6f
+  c6 = -0.601564466953277587890625f0
+  c5 =  2.8208892345428466796875f0
+  c4 = -5.532182216644287109375f0
+  c3 =  5.898262500762939453125f0
+  c2 = -3.8095417022705078125f0
+  c1 =  2.2241256237030029296875f0
+  evalpoly(x, (c1, c2, c3, c4, c5, c6))  # Horner form: c1 + x*(c2 + ... + x*c6), lowest-order coefficient first
 end
 """
 Algorithm:
diff --git a/src/priv.jl b/src/priv.jl
index c0eac17..b123299 100644
--- a/src/priv.jl
+++ b/src/priv.jl
@@ -256,13 +256,13 @@ const under_expk(::Type{Float32}) = -104f0
     return estrin(x, (c1, c2, c3, c4, c5, c6, c7, c8, c9, c10))
 end
 
-@inline function  expk_kernel(x::FloatType32)
+@inline function expk_kernel(x::FloatType32)
     c5 = 0.00136324646882712841033936f0
     c4 = 0.00836596917361021041870117f0
     c3 = 0.0416710823774337768554688f0
     c2 = 0.166665524244308471679688f0
     c1 = 0.499999850988388061523438f0
-    return @horner x c1 c2 c3 c4 c5
+    return evalpoly(x, (c1, c2, c3, c4, c5))  # Horner form: c1 + x*(c2 + ... + x*c5)
 end
 
 @inline function expk(d::Double{V}) where {V <: vIEEEFloat}
@@ -303,13 +303,13 @@ end
     return dadd(dmul(x, u), c1)
 end
 
-@inline function  expk2_kernel(x::Double{<:FloatType32})
+@inline function expk2_kernel(x::Double{<:FloatType32})
     c5 = 0.1980960224f-3
     c4 = 0.1394256484f-2
     c3 = 0.8333456703f-2
     c2 = 0.4166637361f-1
     c1 = 0.166666659414234244790680580464f0
-    u = @horner x.hi c2 c3 c4 c5
+    u = evalpoly(x.hi, (c2, c3, c4, c5))  # Horner in x.hi only: c2 + x.hi*(c3 + x.hi*(c4 + x.hi*c5)); c1 is added below
     return dadd(dmul(x, u), c1)
 end
 
@@ -343,7 +343,7 @@ end
     c3 = 0.285714285511134091777308
     c2 = 0.400000000000914013309483
     c1 = 0.666666666666664853302393
-    return @horner x c1 c2 c3 c4 c5 c6 c7 c8
+    return evalpoly(x, (c1, c2, c3, c4, c5, c6, c7, c8))
 end
 
 @inline function logk2_kernel(x::FloatType32)
@@ -351,7 +351,7 @@ end
     c3 = 0.285112679004669189453125f0
     c2 = 0.400007992982864379882812f0
     c1 = 0.666666686534881591796875f0
-    return @horner x c1 c2 c3 c4
+    return evalpoly(x, (c1, c2, c3, c4))
 end
 
 @inline function logk2(d::Double{V}) where {V <: vIEEEFloat}
@@ -393,7 +393,7 @@ end
     c3 = 0.285112679004669189453125f0
     c2 = 0.400007992982864379882812f0
     c1 = Double(0.66666662693023681640625f0, 3.69183861259614332084311f-9)
-    dadd(dmul(x, @horner x.hi c2 c3 c4), c1)
+    dadd(dmul(x, evalpoly(x.hi, (c2, c3, c4))), c1)
 end
 
 logkmul(::Type{Float64}) = 1.8446744073709551616e19
diff --git a/src/trig.jl b/src/trig.jl
index c4d8153..cd8ea22 100644
--- a/src/trig.jl
+++ b/src/trig.jl
@@ -31,7 +31,7 @@ end
     c3 = -0.0001981069071916863322258f0
     c2 =  0.00833307858556509017944336f0
     c1 = -0.166666597127914428710938f0
-    return dadd(c1, (x.hi * (@horner x.hi c2 c3 c4)))
+    return dadd(c1, (x.hi * (evalpoly(x.hi, (c2, c3, c4)))))
 end
 
 @inline function sin(d::V) where V <: FloatType64
@@ -189,7 +189,7 @@ end
     c3 = -0.0001981069071916863322258f0
     c2 =  0.00833307858556509017944336f0
     c1 = -0.166666597127914428710938f0
-    return @horner x c1 c2 c3 c4
+    return evalpoly(x, (c1, c2, c3, c4))
 end
 
 @inline function sin_fast(d::FloatType64)
@@ -329,14 +329,14 @@ function sincos_fast end
     a3 = -0.000198412698278911770864914
     a2 =  0.0083333333333191845961746
     a1 = -0.166666666666666130709393
-    return @horner x a1 a2 a3 a4 a5 a6
+    return evalpoly(x, (a1, a2, a3, a4, a5, a6))
 end
 
 @inline function sincos_a_kernel(x::FloatType32)
     a3 = -0.000195169282960705459117889f0
     a2 =  0.00833215750753879547119141f0
     a1 = -0.166666537523269653320312f0
-    return @horner x a1 a2 a3
+    return evalpoly(x, (a1, a2, a3))  # Horner form: a1 + x*(a2 + x*a3)
 end
 
 @inline function sincos_b_kernel(x::FloatType64)
@@ -347,7 +347,7 @@ end
     b3 = -0.00138888888888714019282329
     b2 =  0.0416666666666665519592062
     b1 = -0.50
-    return @horner x b1 b2 b3 b4 b5 b6 b7
+    return evalpoly(x, (b1, b2, b3, b4, b5, b6, b7))
 end
 
 @inline function sincos_b_kernel(x::FloatType32)
@@ -356,7 +356,7 @@ end
     b3 = -0.00138888787478208541870117f0
     b2 =  0.0416666641831398010253906f0
     b1 = -0.5f0
-    return @horner x b1 b2 b3 b4 b5
+    return evalpoly(x, (b1, b2, b3, b4, b5))
 end
 
 @inline function sincos_fast(d::FloatType64)
@@ -596,7 +596,7 @@ end
     c3 =  0.0540687143802642822265625f0
     c2 =  0.133325666189193725585938f0
     c1 =  0.33333361148834228515625f0
-    return @horner x c1 c2 c3 c4 c5 c6 c7
+    return evalpoly(x, (c1, c2, c3, c4, c5, c6, c7))
 end
 
 @inline function tan_fast(d::FloatType64)
@@ -688,7 +688,7 @@ end
     c3 =  0.0540687143802642822265625f0
     c2 =  0.133325666189193725585938f0
     c1 =  0.33333361148834228515625f0
-    return dadd(c1,  x.hi * (@horner x.hi c2 c3 c4 c5 c6 c7))
+    return dadd(c1,  x.hi * evalpoly(x.hi, (c2, c3, c4, c5, c6, c7)))
 end
 
 @inline function tan(d::V) where V <: FloatType64