From bf58e1158ab882951168b470219563be265a7302 Mon Sep 17 00:00:00 2001 From: Andrey Zgarbul Date: Mon, 8 Aug 2022 15:46:34 +0300 Subject: [PATCH 1/5] release 0.2.0 --- Cargo.toml | 2 +- README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 78d6bba..a7e1f82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "sleef" edition = "2021" -version = "0.1.0" +version = "0.2.0" authors = ["Andrey Zgarbul "] description = "Math functions for SIMD vectors" keywords = ["simd", "libm", "math"] diff --git a/README.md b/README.md index d393691..f2222e5 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ # sleef-rs -Rust port of [Sleef] math library based on [Portable Packed SIMD Vectors] +Rust port of [Sleef] math library based on [Portable SIMD Vectors] a.k.a. `core::simd` [Sleef]: https://github.com/shibatch/sleef/ -[Portable Packed SIMD Vectors]: https://github.com/rust-lang/packed_simd \ No newline at end of file +[Portable SIMD Vectors]: https://github.com/rust-lang/portable-simd \ No newline at end of file From a766d5d50cc2c6e8ba996fa038187a24b7b9ae39 Mon Sep 17 00:00:00 2001 From: Andrey Zgarbul Date: Mon, 8 Aug 2022 18:28:57 +0300 Subject: [PATCH 2/5] cleanups --- src/common.rs | 2 -- src/f32.rs | 8 -------- src/f32x.rs | 18 ------------------ src/f32x/u10_impl.rs | 4 ++-- src/f32x/u35_impl.rs | 8 ++++---- src/f64.rs | 8 -------- src/f64x.rs | 20 -------------------- src/f64x/u10_impl.rs | 4 ++-- src/f64x/u35_impl.rs | 8 ++++---- 9 files changed, 12 insertions(+), 68 deletions(-) diff --git a/src/common.rs b/src/common.rs index f15608c..83b8d7c 100644 --- a/src/common.rs +++ b/src/common.rs @@ -100,8 +100,6 @@ where } pub trait Sign: MaskType + BitsType { - /* fn is_sign_negative(self) -> Self::Mask; - fn is_sign_positive(self) -> Self::Mask;*/ fn sign_bit(self) -> Self::Bits; fn sign(self) -> Self; fn mul_sign(self, other: Self) -> Self; diff --git a/src/f32.rs b/src/f32.rs index 
e2cd6ed..2c40a1c 100644 --- a/src/f32.rs +++ b/src/f32.rs @@ -495,14 +495,6 @@ impl Poly for f32 { } impl Sign for f32 { - /* #[inline] - fn is_sign_negative(self) -> Self::Mask { - self.is_sign_negative() - } - #[inline] - fn is_sign_positive(self) -> Self::Mask { - self.is_sign_positive() - }*/ #[inline] fn sign_bit(self) -> Self::Bits { self.to_bits() & (1 << 31) diff --git a/src/f32x.rs b/src/f32x.rs index a259eba..6f2a59e 100644 --- a/src/f32x.rs +++ b/src/f32x.rs @@ -647,25 +647,7 @@ macro_rules! impl_math_f32 { } } - #[inline] - fn vsel_vi2_vf_vf_vi2_vi2(f0: F32x, f1: F32x, x: I32x, y: I32x) -> I32x { - f0.simd_lt(f1).select(x, y) - } - - #[inline] - fn vsel_vi2_vf_vi2(d: F32x, x: I32x) -> I32x { - d.is_sign_negative().to_int() & x - } - impl Sign for F32x { -/* #[inline] - fn is_sign_negative(self) -> Self::Mask { - self.sign_bit().simd_ne(Self::Bits::splat(0)) - } - #[inline] - fn is_sign_positive(self) -> Self::Mask { - !self.is_sign_negative() - }*/ #[inline] fn sign_bit(self) -> Self::Bits { self.to_bits() & NEG_ZERO.to_bits() diff --git a/src/f32x/u10_impl.rs b/src/f32x/u10_impl.rs index 14e52ef..64882c8 100644 --- a/src/f32x/u10_impl.rs +++ b/src/f32x/u10_impl.rs @@ -542,7 +542,7 @@ macro_rules! impl_math_f32_u10 { #[inline] fn atan2kf_u1(y: Doubled, mut x: Doubled) -> Doubled { - let q = vsel_vi2_vf_vf_vi2_vi2(x.0, ZERO, I32x::splat(-2), I32x::splat(0)); + let q = x.0.simd_lt(ZERO).select(I32x::splat(-2), I32x::splat(0)); let p = x.0.simd_lt(ZERO); let r = p.to_int().cast() & NEG_ZERO.to_bits(); x = Doubled::new( @@ -550,7 +550,7 @@ macro_rules! 
impl_math_f32_u10 { F32x::from_bits(x.1.to_bits() ^ r) ); - let q = vsel_vi2_vf_vf_vi2_vi2(x.0, y.0, q + I32x::splat(1), q); + let q = x.0.simd_lt(y.0).select(q + I32x::splat(1), q); let p = x.0.simd_lt(y.0); let s = p.select_doubled(-x, y); let mut t = p.select_doubled(y, x); diff --git a/src/f32x/u35_impl.rs b/src/f32x/u35_impl.rs index 54e245e..816fd6a 100644 --- a/src/f32x/u35_impl.rs +++ b/src/f32x/u35_impl.rs @@ -654,10 +654,10 @@ macro_rules! impl_math_f32_u35 { #[inline] fn atan2kf(y: F32x, x: F32x) -> F32x { - let q = vsel_vi2_vf_vi2(x, I32x::splat(-2)); + let q = x.is_sign_negative().to_int() & I32x::splat(-2); let x = x.abs(); - let q = vsel_vi2_vf_vf_vi2_vi2(x, y, q + I32x::splat(1), q); + let q = x.simd_lt(y).select(q + I32x::splat(1), q); let p = x.simd_lt(y); let s = p.select(-x, y); let mut t = x.simd_max(y); @@ -790,10 +790,10 @@ macro_rules! impl_math_f32_u35 { /// These functions evaluates the arc tangent function of a value in ***a***. /// The error bound of the returned value is `3.5 ULP`. pub fn atanf(d: F32x) -> F32x { - let q = vsel_vi2_vf_vi2(d, I32x::splat(2)); + let q = d.is_sign_negative().to_int() & I32x::splat(2); let s = d.abs(); - let q = vsel_vi2_vf_vf_vi2_vi2(ONE, s, q + I32x::splat(1), q); + let q = ONE.simd_lt(s).select(q + I32x::splat(1), q); let s = ONE.simd_lt(s).select(s.recip(), s); let mut t = s * s; diff --git a/src/f64.rs b/src/f64.rs index c91df15..1d9dfba 100644 --- a/src/f64.rs +++ b/src/f64.rs @@ -509,14 +509,6 @@ impl Poly for f64 { } impl Sign for f64 { - /* #[inline] - fn is_sign_negative(self) -> Self::Mask { - self.is_sign_negative() - } - #[inline] - fn is_sign_positive(self) -> Self::Mask { - self.is_sign_positive() - }*/ #[inline] fn sign_bit(self) -> Self::Bits { self.to_bits() & (1 << 63) diff --git a/src/f64x.rs b/src/f64x.rs index d974000..aa59935 100644 --- a/src/f64x.rs +++ b/src/f64x.rs @@ -608,27 +608,7 @@ macro_rules! impl_math_f64 { } } - // return d0 < d1 ? 
x : y - #[inline] - fn vsel_vi_vd_vd_vi_vi(d0: F64x, d1: F64x, x: Ix, y: Ix) -> Ix { - d0.simd_lt(d1).cast().select(x, y) - } - - // return d0 < 0 ? x : 0 - #[inline] - fn vsel_vi_vd_vi(d: F64x, x: Ix) -> Ix { - d.is_sign_negative().cast::().to_int() & x - } - impl Sign for F64x { -/* #[inline] - fn is_sign_negative(self) -> Self::Mask { - self.sign_bit().simd_ne(Self::Bits::splat(0)) - } - #[inline] - fn is_sign_positive(self) -> Self::Mask { - !self.is_sign_negative() - }*/ #[inline] fn sign_bit(self) -> Self::Bits { self.to_bits() & NEG_ZERO.to_bits() diff --git a/src/f64x/u10_impl.rs b/src/f64x/u10_impl.rs index 05348e5..56316fc 100644 --- a/src/f64x/u10_impl.rs +++ b/src/f64x/u10_impl.rs @@ -711,7 +711,7 @@ macro_rules! impl_math_f64_u10 { #[inline] fn atan2k_u1(y: Doubled, mut x: Doubled) -> Doubled { - let q = vsel_vi_vd_vi(x.0, Ix::splat(-2)); + let q = x.0.is_sign_negative().cast().to_int() & Ix::splat(-2); let p = x.0.simd_lt(ZERO); let b = p.to_int().cast() & NEG_ZERO.to_bits(); x = Doubled::new( @@ -719,7 +719,7 @@ macro_rules! impl_math_f64_u10 { F64x::from_bits(b ^ x.1.to_bits()) ); - let q = vsel_vi_vd_vd_vi_vi(x.0, y.0, q + Ix::splat(1), q); + let q = x.0.simd_lt(y.0).cast().select(q + Ix::splat(1), q); let p = x.0.simd_lt(y.0); let s = p.select_doubled(-x, y); let mut t = p.select_doubled(y, x); diff --git a/src/f64x/u35_impl.rs b/src/f64x/u35_impl.rs index 7cd86a2..8f8c979 100644 --- a/src/f64x/u35_impl.rs +++ b/src/f64x/u35_impl.rs @@ -739,10 +739,10 @@ macro_rules! impl_math_f64_u35 { #[inline] fn atan2k(y: F64x, x: F64x) -> F64x { - let q = vsel_vi_vd_vi(x, Ix::splat(-2)); + let q = x.is_sign_negative().cast().to_int() & Ix::splat(-2); let x = x.abs(); - let q = vsel_vi_vd_vd_vi_vi(x, y, q + Ix::splat(1), q); + let q = x.simd_lt(y).cast().select(q + Ix::splat(1), q); let p = x.simd_lt(y); let s = p.select(-x, y); let mut t = x.simd_max(y); @@ -924,10 +924,10 @@ macro_rules! 
impl_math_f64_u35 { let w = s; }*/ - let q = vsel_vi_vd_vi(s, Ix::splat(2)); + let q = s.is_sign_negative().cast().to_int() & Ix::splat(2); s = s.abs(); - let q = vsel_vi_vd_vd_vi_vi(ONE, s, q + Ix::splat(1), q); + let q = ONE.simd_lt(s).cast().select(q + Ix::splat(1), q); s = ONE.simd_lt(s).select(s.recip(), s); let mut t = s * s; From 4d5aa3c90531e716df08ce1bb432763f8cf058c2 Mon Sep 17 00:00:00 2001 From: Andrey Zgarbul Date: Mon, 8 Aug 2022 18:34:31 +0300 Subject: [PATCH 3/5] changelog --- CHANGELOG.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..fe6a583 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,19 @@ +# Change Log + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/) +and this project adheres to [Semantic Versioning](http://semver.org/). + +## [Unreleased] + +## [v0.2.0] - 2022-08-08 + +### Changed + +- Ported to `core::simd` + +## [v0.1.0] - 2022-08-05 + +[Unreleased]: https://github.com/burrbull/sleef-rs/compare/v0.2.0...HEAD +[v0.2.0]: https://github.com/burrbull/sleef-rs/compare/v0.1.0...v0.2.0 From 3ea454c3f142c8a40e4ab25c6d0d03f6a001946d Mon Sep 17 00:00:00 2001 From: Andrey Zgarbul Date: Mon, 8 Aug 2022 18:53:54 +0300 Subject: [PATCH 4/5] don't use mul_add if absent --- src/f32x.rs | 24 ++++++++++++++++++------ src/f64x.rs | 24 ++++++++++++++++++------ 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/f32x.rs b/src/f32x.rs index 6f2a59e..69ad49a 100644 --- a/src/f32x.rs +++ b/src/f32x.rs @@ -553,24 +553,36 @@ macro_rules! 
impl_math_f32 { impl MulAdd for F32x { #[inline] fn mul_add(self, y: Self, z: Self) -> Self { - use std::simd::{StdFloat}; - ::mul_add(self, y, z) + if cfg!(target_feature = "fma") { + use std::simd::{StdFloat}; + ::mul_add(self, y, z) + } else { + self * y + z + } } } impl MulSub for F32x { #[inline] fn mul_sub(self, y: Self, z: Self) -> Self { - use std::simd::{StdFloat}; - ::mul_add(self, y, -z) + if cfg!(target_feature = "fma") { + use std::simd::{StdFloat}; + ::mul_add(self, y, -z) + } else { + self * y - z + } } } impl NegMulAdd for F32x { #[inline] fn neg_mul_add(self, y: Self, z: Self) -> Self { - use std::simd::{StdFloat}; - ::mul_add(-self, y, z) + if cfg!(target_feature = "fma") { + use std::simd::{StdFloat}; + ::mul_add(-self, y, z) + } else { + -self * y + z + } } } diff --git a/src/f64x.rs b/src/f64x.rs index aa59935..9e07170 100644 --- a/src/f64x.rs +++ b/src/f64x.rs @@ -526,24 +526,36 @@ macro_rules! impl_math_f64 { impl MulAdd for F64x { #[inline] fn mul_add(self, y: Self, z: Self) -> Self { - use std::simd::{StdFloat}; - ::mul_add(self, y, z) + if cfg!(target_feature = "fma") { + use std::simd::{StdFloat}; + ::mul_add(self, y, z) + } else { + self * y + z + } } } impl MulSub for F64x { #[inline] fn mul_sub(self, y: Self, z: Self) -> Self { - use std::simd::{StdFloat}; - ::mul_add(self, y, -z) + if cfg!(target_feature = "fma") { + use std::simd::{StdFloat}; + ::mul_add(self, y, -z) + } else { + self * y - z + } } } impl NegMulAdd for F64x { #[inline] fn neg_mul_add(self, y: Self, z: Self) -> Self { - use std::simd::{StdFloat}; - ::mul_add(-self, y, z) + if cfg!(target_feature = "fma") { + use std::simd::{StdFloat}; + ::mul_add(-self, y, z) + } else { + -self * y + z + } } } From 508a0a81c08a0d2cef310ccda10cb8d5d24eeea7 Mon Sep 17 00:00:00 2001 From: Andrey Zgarbul Date: Mon, 8 Aug 2022 20:01:58 +0300 Subject: [PATCH 5/5] mul_add -> mla --- src/common.rs | 46 +++--- src/f32.rs | 22 +-- src/f32/fast.rs | 44 +++--- src/f32/u05.rs | 8 +- src/f32/u10.rs 
| 166 +++++++++++----------- src/f32/u15.rs | 8 +- src/f32/u35.rs | 174 +++++++++++------------ src/f32x.rs | 34 ++--- src/f32x/fast_impl.rs | 46 +++--- src/f32x/u05_impl.rs | 8 +- src/f32x/u10_impl.rs | 212 ++++++++++++++-------------- src/f32x/u15_impl.rs | 8 +- src/f32x/u35_impl.rs | 294 +++++++++++++++++++------------------- src/f64.rs | 44 +++--- src/f64/u05.rs | 30 ++-- src/f64/u10.rs | 144 +++++++++---------- src/f64/u15.rs | 34 ++--- src/f64/u35.rs | 218 ++++++++++++++--------------- src/f64x.rs | 30 ++-- src/f64x/u05_impl.rs | 30 ++-- src/f64x/u10_impl.rs | 224 ++++++++++++++--------------- src/f64x/u15_impl.rs | 34 ++--- src/f64x/u35_impl.rs | 318 +++++++++++++++++++++--------------------- 23 files changed, 1092 insertions(+), 1084 deletions(-) diff --git a/src/common.rs b/src/common.rs index 83b8d7c..2bee51d 100644 --- a/src/common.rs +++ b/src/common.rs @@ -76,7 +76,7 @@ pub trait Round { } pub trait MulAdd { - fn mul_add(self, y: Self, z: Self) -> Self; + fn mla(self, y: Self, z: Self) -> Self; } pub trait MulSub { @@ -139,25 +139,25 @@ where { fn c2v(c: B) -> Self; fn poly2(x: Self, c1: B, c0: B) -> Self { - x.mul_add(Poly::c2v(c1), Poly::c2v(c0)) + x.mla(Poly::c2v(c1), Poly::c2v(c0)) } fn poly3(x: Self, x2: Self, c2: B, c1: B, c0: B) -> Self { - x2.mul_add(Poly::c2v(c2), x.mul_add(Poly::c2v(c1), Poly::c2v(c0))) + x2.mla(Poly::c2v(c2), x.mla(Poly::c2v(c1), Poly::c2v(c0))) } fn poly4(x: Self, x2: Self, c3: B, c2: B, c1: B, c0: B) -> Self { - x2.mul_add( - x.mul_add(Poly::c2v(c3), Poly::c2v(c2)), - x.mul_add(Poly::c2v(c1), Poly::c2v(c0)), + x2.mla( + x.mla(Poly::c2v(c3), Poly::c2v(c2)), + x.mla(Poly::c2v(c1), Poly::c2v(c0)), ) } fn poly5(x: Self, x2: Self, x4: Self, c4: B, c3: B, c2: B, c1: B, c0: B) -> Self { - x4.mul_add(Poly::c2v(c4), Poly::poly4(x, x2, c3, c2, c1, c0)) + x4.mla(Poly::c2v(c4), Poly::poly4(x, x2, c3, c2, c1, c0)) } fn poly6(x: Self, x2: Self, x4: Self, c5: B, c4: B, c3: B, c2: B, c1: B, c0: B) -> Self { - x4.mul_add(Poly::poly2(x, c5, 
c4), Poly::poly4(x, x2, c3, c2, c1, c0)) + x4.mla(Poly::poly2(x, c5, c4), Poly::poly4(x, x2, c3, c2, c1, c0)) } fn poly7(x: Self, x2: Self, x4: Self, c6: B, c5: B, c4: B, c3: B, c2: B, c1: B, c0: B) -> Self { - x4.mul_add( + x4.mla( Poly::poly3(x, x2, c6, c5, c4), Poly::poly4(x, x2, c3, c2, c1, c0), ) @@ -175,7 +175,7 @@ where c1: B, c0: B, ) -> Self { - x4.mul_add( + x4.mla( Poly::poly4(x, x2, c7, c6, c5, c4), Poly::poly4(x, x2, c3, c2, c1, c0), ) @@ -195,7 +195,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::c2v(c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -216,7 +216,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::poly2(x, c9, c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -238,7 +238,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::poly3(x, x2, ca, c9, c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -261,7 +261,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::poly4(x, x2, cb, ca, c9, c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -285,7 +285,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::poly5(x, x2, x4, cc, cb, ca, c9, c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -310,7 +310,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::poly6(x, x2, x4, cd, cc, cb, ca, c9, c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -336,7 +336,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::poly7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -363,7 +363,7 @@ where c1: B, c0: B, ) -> Self { - x8.mul_add( + x8.mla( Poly::poly8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), Poly::poly8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0), ) @@ -392,7 +392,7 @@ where c1: B, c0: B, ) -> Self { - x16.mul_add( + x16.mla( Poly::c2v(d0), Poly::poly16( x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, 
c4, c3, c2, c1, c0, @@ -424,7 +424,7 @@ where c1: B, c0: B, ) -> Self { - x16.mul_add( + x16.mla( Poly::poly2(x, d1, d0), Poly::poly16( x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0, @@ -457,7 +457,7 @@ where c1: B, c0: B, ) -> Self { - x16.mul_add( + x16.mla( Poly::poly3(x, x2, d2, d1, d0), Poly::poly16( x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0, @@ -491,7 +491,7 @@ where c1: B, c0: B, ) -> Self { - x16.mul_add( + x16.mla( Poly::poly4(x, x2, d3, d2, d1, d0), Poly::poly16( x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0, @@ -526,7 +526,7 @@ where c1: B, c0: B, ) -> Self { - x16.mul_add( + x16.mla( Poly::poly5(x, x2, x4, d4, d3, d2, d1, d0), Poly::poly16( x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0, diff --git a/src/f32.rs b/src/f32.rs index 2c40a1c..52f1d4b 100644 --- a/src/f32.rs +++ b/src/f32.rs @@ -483,8 +483,12 @@ impl BitsType for f32 { impl MulAdd for f32 { #[inline] - fn mul_add(self, y: Self, z: Self) -> Self { - self * y + z + fn mla(self, y: Self, z: Self) -> Self { + if cfg!(target_feature = "fma") { + self.mul_add(y, z) + } else { + self * y + z + } } } @@ -672,9 +676,9 @@ fn expk2f(d: Doubled) -> Doubled { s += qf * -L2L_F; let u = 0.198_096_022_4_e-3_f32 - .mul_add(s.0, 0.139_425_648_4_e-2) - .mul_add(s.0, 0.833_345_670_3_e-2) - .mul_add(s.0, 0.416_663_736_1_e-1); + .mla(s.0, 0.139_425_648_4_e-2) + .mla(s.0, 0.833_345_670_3_e-2) + .mla(s.0, 0.416_663_736_1_e-1); let mut t = s * u + 0.166_666_659_414_234_244_790_680_580_464; t = s * t + 0.5; @@ -709,7 +713,7 @@ fn sinpifk(d: f32) -> Doubled { } else { 0.309_384_205_4_e-6 }) - .mul_add( + .mla( s, if o { 0.359_057_708_e-5 @@ -717,7 +721,7 @@ fn sinpifk(d: f32) -> Doubled { -0.365_730_738_8_e-4 }, ) - .mul_add( + .mla( s, if o { -0.325_991_772_1_e-3 @@ -779,7 +783,7 @@ fn cospifk(d: f32) -> Doubled { } else { 0.309_384_205_4_e-6 }) - .mul_add( + .mla( s, if o { 
0.359_057_708_e-5 @@ -787,7 +791,7 @@ fn cospifk(d: f32) -> Doubled { -0.365_730_738_8_e-4 }, ) - .mul_add( + .mla( s, if o { -0.325_991_772_1_e-3 diff --git a/src/f32/fast.rs b/src/f32/fast.rs index 4c6251b..656f076 100644 --- a/src/f32/fast.rs +++ b/src/f32/fast.rs @@ -9,14 +9,14 @@ pub fn sinf(mut d: f32) -> f32 { let t = d; let q = rintfk(d * FRAC_1_PI); - d = q.mul_add(-PI, d); + d = q.mla(-PI, d); let s = d * d; let mut u = (-0.188_174_817_6_e-3) - .mul_add(s, 0.832_350_272_7_e-2) - .mul_add(s, -0.166_665_136_8); - u = (s * d).mul_add(u, d); + .mla(s, 0.832_350_272_7_e-2) + .mla(s, -0.166_665_136_8); + u = (s * d).mla(u, d); if ((q as i32) & 1) != 0 { u = -u; @@ -46,15 +46,15 @@ fn test_sinf() { pub fn cosf(mut d: f32) -> f32 { let t = d; - let q = rintfk(d.mul_add(FRAC_1_PI, -0.5)); - d = q.mul_add(-PI, d - FRAC_PI_2); + let q = rintfk(d.mla(FRAC_1_PI, -0.5)); + d = q.mla(-PI, d - FRAC_PI_2); let s = d * d; let mut u = (-0.188_174_817_6_e-3) - .mul_add(s, 0.832_350_272_7_e-2) - .mul_add(s, -0.166_665_136_8); - u = (s * d).mul_add(u, d); + .mla(s, 0.832_350_272_7_e-2) + .mla(s, -0.166_665_136_8); + u = (s * d).mla(u, d); if ((q as i32) & 1) == 0 { u = -u; @@ -96,29 +96,29 @@ fn logk3f(mut d: f32) -> f32 { let x2 = x * x; let t = 0.239_282_846_450_805_664_062_5 - .mul_add(x2, 0.285_182_118_415_832_519_531_25) - .mul_add(x2, 0.400_005_877_017_974_853_515_625) - .mul_add(x2, 0.666_666_686_534_881_591_796_875) - .mul_add(x2, 2.); + .mla(x2, 0.285_182_118_415_832_519_531_25) + .mla(x2, 0.400_005_877_017_974_853_515_625) + .mla(x2, 0.666_666_686_534_881_591_796_875) + .mla(x2, 2.); - x.mul_add(t, 0.693_147_180_559_945_286_226_764 * (e as f32)) + x.mla(t, 0.693_147_180_559_945_286_226_764 * (e as f32)) } #[inline] fn expk3f(d: f32) -> f32 { let q = rintfk(d * R_LN2_F); - let mut s = q.mul_add(-L2U_F, d); - s = q.mul_add(-L2L_F, s); + let mut s = q.mla(-L2U_F, d); + s = q.mla(-L2L_F, s); let mut u = 0.000_198_527_617_612_853_646_278_381 - .mul_add(s, 
0.001_393_043_552_525_341_510_772_71) - .mul_add(s, 0.008_333_360_776_305_198_669_433_59) - .mul_add(s, 0.041_666_485_369_205_474_853_515_6) - .mul_add(s, 0.166_666_671_633_720_397_949_219) - .mul_add(s, 0.5); + .mla(s, 0.001_393_043_552_525_341_510_772_71) + .mla(s, 0.008_333_360_776_305_198_669_433_59) + .mla(s, 0.041_666_485_369_205_474_853_515_6) + .mla(s, 0.166_666_671_633_720_397_949_219) + .mla(s, 0.5); - u = (s * s).mul_add(u, s + 1.); + u = (s * s).mla(u, s + 1.); u = ldexpkf(u, q as i32); if d < -104. { diff --git a/src/f32/u05.rs b/src/f32/u05.rs index 3bf322e..3df4779 100644 --- a/src/f32/u05.rs +++ b/src/f32/u05.rs @@ -20,8 +20,8 @@ pub fn sincospif(d: f32) -> (f32, f32) { // let u = 0.309_384_205_4_e-6_f32 - .mul_add(s, -0.365_730_738_8_e-4) - .mul_add(s, 0.249_039_358_5_e-2); + .mla(s, -0.365_730_738_8_e-4) + .mla(s, 0.249_039_358_5_e-2); let mut x = u * s + Doubled::new( -0.080_745_510_756_969_451_904, @@ -37,8 +37,8 @@ pub fn sincospif(d: f32) -> (f32, f32) { let mut rsin = if d.is_neg_zero() { -0. 
} else { f32::from(x) }; let u = (-0.243_061_180_1_e-7_f32) - .mul_add(s, 0.359_057_708_e-5) - .mul_add(s, -0.325_991_772_1_e-3); + .mla(s, 0.359_057_708_e-5) + .mla(s, -0.325_991_772_1_e-3); x = u * s + Doubled::new( 0.015_854_343_771_934_509_277, diff --git a/src/f32/u10.rs b/src/f32/u10.rs index b810756..e15c81a 100644 --- a/src/f32/u10.rs +++ b/src/f32/u10.rs @@ -13,7 +13,7 @@ pub fn sinf(d: f32) -> f32 { if fabsfk(d) < TRIGRANGEMAX2_F { let qf = rintfk(d * FRAC_1_PI); q = qf as i32; - let u = qf.mul_add(-PI_A2_F, d); + let u = qf.mla(-PI_A2_F, d); s = u.add_as_doubled(qf * (-PI_B2_F)); s.add_checked_assign(qf * (-PI_C2_F)); } else { @@ -35,8 +35,8 @@ pub fn sinf(d: f32) -> f32 { s = s.square(); let mut u = 2.608_315_980_978_659_354_150_3_e-6_f32 - .mul_add(s.0, -0.000_198_106_907_191_686_332_225_8) - .mul_add(s.0, 0.008_333_078_585_565_090_179_443_36); + .mla(s.0, -0.000_198_106_907_191_686_332_225_8) + .mla(s.0, 0.008_333_078_585_565_090_179_443_36); let x = (1.).add_checked((-0.166_666_597_127_914_428_710_938).add_checked_as_doubled(u * s.0) * s); @@ -68,7 +68,7 @@ pub fn cosf(mut d: f32) -> f32 { if fabsfk(d) < TRIGRANGEMAX2_F { d = fabsfk(d); - let dq = rintfk(d * FRAC_1_PI - 0.5).mul_add(2., 1.); + let dq = rintfk(d * FRAC_1_PI - 0.5).mla(2., 1.); q = dq as i32; s = d.add_as_doubled(dq * (-PI_A2_F * 0.5)); s += dq * (-PI_B2_F * 0.5); @@ -92,8 +92,8 @@ pub fn cosf(mut d: f32) -> f32 { s = s.square(); let mut u = 2.608_315_980_978_659_354_150_3_e-6_f32 - .mul_add(s.0, -0.000_198_106_907_191_686_332_225_8) - .mul_add(s.0, 0.008_333_078_585_565_090_179_443_36); + .mla(s.0, -0.000_198_106_907_191_686_332_225_8) + .mla(s.0, 0.008_333_078_585_565_090_179_443_36); let x = (1.).add_checked((-0.166_666_597_127_914_428_710_938).add_checked_as_doubled(u * s.0) * s); @@ -126,7 +126,7 @@ pub fn sincosf(d: f32) -> (f32, f32) { if fabsfk(d) < TRIGRANGEMAX2_F { let qf = rintfk(d * FRAC_2_PI); q = qf as i32; - let u = qf.mul_add(-PI_A2_F * 0.5, d); + let u = 
qf.mla(-PI_A2_F * 0.5, d); s = u.add_as_doubled(qf * (-PI_B2_F * 0.5)); s.add_checked_assign(qf * (-PI_C2_F * 0.5)); } else { @@ -142,8 +142,8 @@ pub fn sincosf(d: f32) -> (f32, f32) { s.0 = s.square_as_f(); let u = (-0.000_195_169_282_960_705_459_117_889_f32) - .mul_add(s.0, 0.008_332_157_507_538_795_471_191_41) - .mul_add(s.0, -0.166_666_537_523_269_653_320_312) + .mla(s.0, 0.008_332_157_507_538_795_471_191_41) + .mla(s.0, -0.166_666_537_523_269_653_320_312) * s.0 * t.0; @@ -151,10 +151,10 @@ pub fn sincosf(d: f32) -> (f32, f32) { let mut rsin = if d.is_neg_zero() { -0. } else { f32::from(x) }; let u = (-2.718_118_423_672_422_068_193_55_e-7_f32) - .mul_add(s.0, 2.479_904_469_510_074_704_885_48_e-5) - .mul_add(s.0, -0.001_388_887_874_782_085_418_701_17) - .mul_add(s.0, 0.041_666_664_183_139_801_025_390_6) - .mul_add(s.0, -0.5); + .mla(s.0, 2.479_904_469_510_074_704_885_48_e-5) + .mla(s.0, -0.001_388_887_874_782_085_418_701_17) + .mla(s.0, 0.041_666_664_183_139_801_025_390_6) + .mla(s.0, -0.5); x = (1.).add_checked(s.0.mul_as_doubled(u)); let mut rcos = f32::from(x); @@ -195,7 +195,7 @@ pub fn tanf(d: f32) -> f32 { if fabsfk(d) < TRIGRANGEMAX2_F { let qf = rintfk(d * FRAC_2_PI); q = qf as i32; - let u = qf.mul_add(-PI_A2_F * 0.5, d); + let u = qf.mla(-PI_A2_F * 0.5, d); s = u.add_as_doubled(qf * (-PI_B2_F * 0.5)); s.add_checked_assign(qf * (-PI_C2_F * 0.5)); } else { @@ -215,10 +215,10 @@ pub fn tanf(d: f32) -> f32 { s = s.square().normalize(); let u = 0.004_466_364_625_841_379_165_649_41_f32 - .mul_add(s.0, -8.392_018_207_814_544_439_315_8_e-5) - .mul_add(s.0, 0.010_963_924_229_145_050_048_828_1) - .mul_add(s.0, 0.021_236_030_384_898_185_729_980_5) - .mul_add(s.0, 0.054_068_714_380_264_282_226_562_5); + .mla(s.0, -8.392_018_207_814_544_439_315_8_e-5) + .mla(s.0, 0.010_963_924_229_145_050_048_828_1) + .mla(s.0, 0.021_236_030_384_898_185_729_980_5) + .mla(s.0, 0.054_068_714_380_264_282_226_562_5); let mut x = 
(0.133_325_666_189_193_725_585_938).add_checked_as_doubled(u * s.0); x = (1.).add_checked((0.333_333_611_488_342_285_156_25).add_checked(s * x) * s); @@ -259,13 +259,13 @@ fn atan2kf_u1(mut y: Doubled, mut x: Doubled) -> Doubled { let mut t = s.square().normalize(); let u = (-0.001_763_979_089_446_365_833_282_47_f32) - .mul_add(t.0, 0.010_790_090_076_625_347_137_451_2) - .mul_add(t.0, -0.030_956_460_162_997_245_788_574_2) - .mul_add(t.0, 0.057_736_508_548_259_735_107_421_9) - .mul_add(t.0, -0.083_895_072_340_965_270_996_093_8) - .mul_add(t.0, 0.109_463_557_600_975_036_621_094) - .mul_add(t.0, -0.142_626_821_994_781_494_140_625) - .mul_add(t.0, 0.199_983_194_470_405_578_613_281); + .mla(t.0, 0.010_790_090_076_625_347_137_451_2) + .mla(t.0, -0.030_956_460_162_997_245_788_574_2) + .mla(t.0, 0.057_736_508_548_259_735_107_421_9) + .mla(t.0, -0.083_895_072_340_965_270_996_093_8) + .mla(t.0, 0.109_463_557_600_975_036_621_094) + .mla(t.0, -0.142_626_821_994_781_494_140_625) + .mla(t.0, 0.199_983_194_470_405_578_613_281); t = t * (-0.333_332_866_430_282_592_773_438).add_checked_as_doubled(u * t.0); t = s * (1.).add_checked(t); @@ -352,10 +352,10 @@ pub fn asinf(d: f32) -> f32 { }; let u = 0.419_745_482_5_e-1_f32 - .mul_add(x2, 0.242_404_602_5_e-1) - .mul_add(x2, 0.454_742_386_9_e-1) - .mul_add(x2, 0.749_502_927_1_e-1) - .mul_add(x2, 0.166_667_729_6) + .mla(x2, 0.242_404_602_5_e-1) + .mla(x2, 0.454_742_386_9_e-1) + .mla(x2, 0.749_502_927_1_e-1) + .mla(x2, 0.166_667_729_6) * x2 * x.0; @@ -388,10 +388,10 @@ pub fn acosf(d: f32) -> f32 { }; let u = 0.419_745_482_5_e-1_f32 - .mul_add(x2, 0.242_404_602_5_e-1) - .mul_add(x2, 0.454_742_386_9_e-1) - .mul_add(x2, 0.749_502_927_1_e-1) - .mul_add(x2, 0.166_667_729_6) + .mla(x2, 0.242_404_602_5_e-1) + .mla(x2, 0.454_742_386_9_e-1) + .mla(x2, 0.749_502_927_1_e-1) + .mla(x2, 0.166_667_729_6) * x.0 * x2; @@ -518,9 +518,9 @@ fn logk2f(d: Doubled) -> Doubled { let x2 = x.square(); let t = 0.239_282_846_450_805_664_062_5_f32 - .mul_add(x2.0, 
0.285_182_118_415_832_519_531_25) - .mul_add(x2.0, 0.400_005_877_017_974_853_515_625) - .mul_add(x2.0, 0.666_666_686_534_881_591_796_875); + .mla(x2.0, 0.285_182_118_415_832_519_531_25) + .mla(x2.0, 0.400_005_877_017_974_853_515_625) + .mla(x2.0, 0.666_666_686_534_881_591_796_875); (D_LN2 * (e as f32)) + x.scale(2.) + x2 * x * t } @@ -656,8 +656,8 @@ pub fn logf(mut d: f32) -> f32 { let x2 = x.0 * x.0; let t = 0.302_729_487_4_f32 - .mul_add(x2, 0.399_610_817_4) - .mul_add(x2, 0.666_669_488); + .mla(x2, 0.399_610_817_4) + .mla(x2, 0.666_669_488); let s = (D_LN2 * (e as f32)) .add_checked(x.scale(2.)) @@ -700,8 +700,8 @@ pub fn log10f(mut d: f32) -> f32 { let x2 = x.0 * x.0; let t = 0.131_428_986_8_f32 - .mul_add(x2, 0.173_549_354_1) - .mul_add(x2, 0.289_530_962_7); + .mla(x2, 0.173_549_354_1) + .mla(x2, 0.289_530_962_7); let s = (Doubled::new(0.301_030_01, -1.432_098_889_e-8) * (e as f32)) .add_checked(x * Doubled::new(0.868_588_984, -2.170_757_285_e-8)) @@ -744,8 +744,8 @@ pub fn log2f(mut d: f32) -> f32 { let x2 = x.0 * x.0; let t = 0.437_455_028_3_f32 - .mul_add(x2, 0.576_479_017_7) - .mul_add(x2, 0.961_801_290_512); + .mla(x2, 0.576_479_017_7) + .mla(x2, 0.961_801_290_512); let mut s = (e as f32) + x * Doubled::new(2.885_390_043_258_666_992_2, 3.273_447_448_356_848_861_6_e-8); @@ -782,7 +782,7 @@ pub fn log1pf(d: f32) -> f32 { let mut e = ilogb2kf(dp1 * (1. 
/ 0.75)); let t = ldexp3kf(1., -e); - let m = d.mul_add(t, t - 1.); + let m = d.mla(t, t - 1.); if o { e -= 64; @@ -792,8 +792,8 @@ pub fn log1pf(d: f32) -> f32 { let x2 = x.0 * x.0; let t = 0.302_729_487_4_f32 - .mul_add(x2, 0.399_610_817_4) - .mul_add(x2, 0.666_669_488); + .mla(x2, 0.399_610_817_4) + .mla(x2, 0.666_669_488); let s = (crate::f32::D_LN2 * (e as f32)) .add_checked(x.scale(2.)) @@ -824,15 +824,15 @@ fn test_log1pf() { pub fn expf(d: f32) -> f32 { let qf = rintfk(d * R_LN2_F); let q = qf as i32; - let s = qf.mul_add(-L2U_F, d); - let s = qf.mul_add(-L2L_F, s); + let s = qf.mla(-L2U_F, d); + let s = qf.mla(-L2L_F, s); let mut u = 0.000_198_527_617_612_853_646_278_381_f32 - .mul_add(s, 0.001_393_043_552_525_341_510_772_71) - .mul_add(s, 0.008_333_360_776_305_198_669_433_59) - .mul_add(s, 0.041_666_485_369_205_474_853_515_6) - .mul_add(s, 0.166_666_671_633_720_397_949_219) - .mul_add(s, 0.5); + .mla(s, 0.001_393_043_552_525_341_510_772_71) + .mla(s, 0.008_333_360_776_305_198_669_433_59) + .mla(s, 0.041_666_485_369_205_474_853_515_6) + .mla(s, 0.166_666_671_633_720_397_949_219) + .mla(s, 0.5); u = s * s * u + s + 1.; @@ -858,15 +858,15 @@ pub fn exp10f(d: f32) -> f32 { let qf = rintfk(d * LOG10_2_F); let q = qf as i32; - let s = qf.mul_add(-L10U_F, d); - let s = qf.mul_add(-L10L_F, s); + let s = qf.mla(-L10U_F, d); + let s = qf.mla(-L10L_F, s); let mut u = 0.680_255_591_9_e-1 - .mul_add(s, 0.207_808_032_6) - .mul_add(s, 0.539_390_385_2) - .mul_add(s, 0.117_124_533_7_e+1) - .mul_add(s, 0.203_467_869_8_e+1) - .mul_add(s, 0.265_094_900_1_e+1); + .mla(s, 0.207_808_032_6) + .mla(s, 0.539_390_385_2) + .mla(s, 0.117_124_533_7_e+1) + .mla(s, 0.203_467_869_8_e+1) + .mla(s, 0.265_094_900_1_e+1); let x = Doubled::new(2.3025851249694824219, -3.1705172516493593157e-08).add_checked(u * s); u = (1.).add_checked(x * s).normalize().0; @@ -916,11 +916,11 @@ pub fn exp2f(d: f32) -> f32 { let s = d - qf; let mut u = 0.153_592_089_2_e-3_f32 - .mul_add(s, 0.133_926_270_1_e-2) 
- .mul_add(s, 0.961_838_476_4_e-2) - .mul_add(s, 0.555_034_726_9_e-1) - .mul_add(s, 0.240_226_447_6) - .mul_add(s, 0.693_147_182_5); + .mla(s, 0.133_926_270_1_e-2) + .mla(s, 0.961_838_476_4_e-2) + .mla(s, 0.555_034_726_9_e-1) + .mla(s, 0.240_226_447_6) + .mla(s, 0.693_147_182_5); u = (1.).add_checked(u.mul_as_doubled(s)).normalize().0; if d >= 128. { @@ -955,8 +955,8 @@ fn logkf(mut d: f32) -> Doubled { let x2 = x.square(); let t = 0.240_320_354_700_088_500_976_562_f32 - .mul_add(x2.0, 0.285_112_679_004_669_189_453_125) - .mul_add(x2.0, 0.400_007_992_982_864_379_882_812); + .mla(x2.0, 0.285_112_679_004_669_189_453_125) + .mla(x2.0, 0.400_007_992_982_864_379_882_812); let c = Doubled::new( 0.666_666_626_930_236_816_406_25, 3.691_838_612_596_143_320_843_11_e-9, @@ -978,10 +978,10 @@ fn expkf(d: Doubled) -> f32 { s = s.normalize(); let u = 0.001_363_246_468_827_128_410_339_36_f32 - .mul_add(s.0, 0.008_365_969_173_610_210_418_701_17) - .mul_add(s.0, 0.041_671_082_377_433_776_855_468_8) - .mul_add(s.0, 0.166_665_524_244_308_471_679_688) - .mul_add(s.0, 0.499_999_850_988_388_061_523_438); + .mla(s.0, 0.008_365_969_173_610_210_418_701_17) + .mla(s.0, 0.041_671_082_377_433_776_855_468_8) + .mla(s.0, 0.166_665_524_244_308_471_679_688) + .mla(s.0, 0.499_999_850_988_388_061_523_438); let mut t = s.add_checked(s.square() * u); @@ -1082,11 +1082,11 @@ pub fn cbrtf(mut d: f32) -> f32 { d = fabsfk(d); let mut x = (-0.601_564_466_953_277_587_890_625_f32) - .mul_add(d, 2.820_889_234_542_846_679_687_5) - .mul_add(d, -5.532_182_216_644_287_109_375) - .mul_add(d, 5.898_262_500_762_939_453_125) - .mul_add(d, -3.809_541_702_270_507_812_5) - .mul_add(d, 2.224_125_623_703_002_929_687_5); + .mla(d, 2.820_889_234_542_846_679_687_5) + .mla(d, -5.532_182_216_644_287_109_375) + .mla(d, 5.898_262_500_762_939_453_125) + .mla(d, -3.809_541_702_270_507_812_5) + .mla(d, 2.224_125_623_703_002_929_687_5); let mut y = x * x; y = y * y; @@ -1153,7 +1153,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { } 
else { 0.110_248_955_e-3 }) - .mul_add( + .mla( t, if o2 { -5.171_790_908_260_592_193_293_944_22_e-5 @@ -1163,7 +1163,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { 0.816_001_993_4_e-4 }, ) - .mul_add( + .mla( t, if o2 { -0.000_592_166_437_353_693_882_857_342_347 @@ -1173,7 +1173,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { 0.152_846_885_6_e-3 }, ) - .mul_add( + .mla( t, if o2 { 6.972_813_758_365_857_774_037_435_39_e-5 @@ -1183,7 +1183,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { -0.235_506_871_8_e-3 }, ) - .mul_add( + .mla( t, if o2 { 0.000_784_039_221_720_066_627_493_314_301 @@ -1193,7 +1193,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { 0.496_224_209_2_e-3 }, ) - .mul_add( + .mla( t, if o2 { -0.000_229_472_093_621_399_176_949_318_732 @@ -1203,7 +1203,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { -0.119_348_801_7_e-2 }, ) - .mul_add( + .mla( t, if o2 { -0.002_681_327_160_493_827_160_473_958_490 @@ -1213,7 +1213,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { 0.289_159_943_3_e-2 }, ) - .mul_add( + .mla( t, if o2 { 0.003_472_222_222_222_222_222_175_164_840 @@ -1223,7 +1223,7 @@ fn gammafk(a: f32) -> (Doubled, Doubled) { -0.738_545_181_2_e-2 }, ) - .mul_add( + .mla( t, if o2 { 0.083_333_333_333_333_333_335_592_087_900 diff --git a/src/f32/u15.rs b/src/f32/u15.rs index 18e3932..f43c276 100644 --- a/src/f32/u15.rs +++ b/src/f32/u15.rs @@ -27,7 +27,7 @@ pub fn erfcf(mut a: f32) -> f32 { } else { 0.111_534_416_7_e+1 } - .mul_add( + .mla( u.0, if o0 { 0.600_016_617_7_e-3 @@ -39,7 +39,7 @@ pub fn erfcf(mut a: f32) -> f32 { -0.945_490_419_9 }, ) - .mul_add( + .mla( u.0, if o0 { -0.166_570_360_3_e-2 @@ -51,7 +51,7 @@ pub fn erfcf(mut a: f32) -> f32 { -0.366_725_951_4 }, ) - .mul_add( + .mla( u.0, if o0 { 0.179_515_627_7_e-3 @@ -63,7 +63,7 @@ pub fn erfcf(mut a: f32) -> f32 { 0.715_566_337_1 }, ) - .mul_add( + .mla( u.0, if o0 { 0.191_410_612_3_e-1 diff --git a/src/f32/u35.rs b/src/f32/u35.rs index 4465f44..32f7ba0 100644 --- a/src/f32/u35.rs +++ b/src/f32/u35.rs 
@@ -13,16 +13,16 @@ pub fn sinf(mut d: f32) -> f32 { if fabsfk(d) < TRIGRANGEMAX2_F { let qf = rintfk(d * FRAC_1_PI); q = qf as i32; - d = qf.mul_add(-PI_A2_F, d); - d = qf.mul_add(-PI_B2_F, d); - d = qf.mul_add(-PI_C2_F, d); + d = qf.mla(-PI_A2_F, d); + d = qf.mla(-PI_B2_F, d); + d = qf.mla(-PI_C2_F, d); } else if fabsfk(d) < TRIGRANGEMAX_F { let qf = rintfk(d * FRAC_1_PI); q = qf as i32; - d = qf.mul_add(-PI_A_F, d); - d = qf.mul_add(-PI_B_F, d); - d = qf.mul_add(-PI_C_F, d); - d = qf.mul_add(-PI_D_F, d); + d = qf.mla(-PI_A_F, d); + d = qf.mla(-PI_B_F, d); + d = qf.mla(-PI_C_F, d); + d = qf.mla(-PI_D_F, d); } else { let (mut dfidf, dfii) = rempif(t); q = ((dfii & 3) * 2 + ((dfidf.0 > 0.) as i32) + 1) >> 2; @@ -45,14 +45,14 @@ pub fn sinf(mut d: f32) -> f32 { } let u = 2.608_315_980_978_659_354_150_3_e-6_f32 - .mul_add(s, -0.000_198_106_907_191_686_332_225_8) - .mul_add(s, 0.008_333_078_585_565_090_179_443_36) - .mul_add(s, -0.166_666_597_127_914_428_710_938); + .mla(s, -0.000_198_106_907_191_686_332_225_8) + .mla(s, 0.008_333_078_585_565_090_179_443_36) + .mla(s, -0.166_666_597_127_914_428_710_938); if t.is_neg_zero() { -0. 
} else { - s.mul_add(u * d, d) + s.mla(u * d, d) } } @@ -72,16 +72,16 @@ pub fn cosf(mut d: f32) -> f32 { if fabsfk(d) < TRIGRANGEMAX2_F { q = 1 + 2 * (rintfk(d * FRAC_1_PI - 0.5) as i32); let qf = q as f32; - d = qf.mul_add(-PI_A2_F * 0.5, d); - d = qf.mul_add(-PI_B2_F * 0.5, d); - d = qf.mul_add(-PI_C2_F * 0.5, d); + d = qf.mla(-PI_A2_F * 0.5, d); + d = qf.mla(-PI_B2_F * 0.5, d); + d = qf.mla(-PI_C2_F * 0.5, d); } else if fabsfk(d) < TRIGRANGEMAX_F { q = 1 + 2 * (rintfk(d * FRAC_1_PI - 0.5) as i32); let qf = q as f32; - d = qf.mul_add(-PI_A_F * 0.5, d); - d = qf.mul_add(-PI_B_F * 0.5, d); - d = qf.mul_add(-PI_C_F * 0.5, d); - d = qf.mul_add(-PI_D_F * 0.5, d); + d = qf.mla(-PI_A_F * 0.5, d); + d = qf.mla(-PI_B_F * 0.5, d); + d = qf.mla(-PI_C_F * 0.5, d); + d = qf.mla(-PI_D_F * 0.5, d); } else { let (mut dfidf, dfii) = rempif(t); q = ((dfii & 3) * 2 + ((dfidf.0 > 0.) as i32) + 7) >> 1; @@ -104,11 +104,11 @@ pub fn cosf(mut d: f32) -> f32 { } let u = 2.608_315_980_978_659_354_150_3_e-6_f32 - .mul_add(s, -0.000_198_106_907_191_686_332_225_8) - .mul_add(s, 0.008_333_078_585_565_090_179_443_36) - .mul_add(s, -0.166_666_597_127_914_428_710_938); + .mla(s, -0.000_198_106_907_191_686_332_225_8) + .mla(s, 0.008_333_078_585_565_090_179_443_36) + .mla(s, -0.166_666_597_127_914_428_710_938); - s.mul_add(u * d, d) + s.mla(u * d, d) } #[test] @@ -130,16 +130,16 @@ pub fn sincosf(d: f32) -> (f32, f32) { if fabsfk(d) < TRIGRANGEMAX2_F { let qf = rintfk(d * FRAC_2_PI); q = qf as i32; - s = qf.mul_add(-PI_A2_F * 0.5, s); - s = qf.mul_add(-PI_B2_F * 0.5, s); - s = qf.mul_add(-PI_C2_F * 0.5, s); + s = qf.mla(-PI_A2_F * 0.5, s); + s = qf.mla(-PI_B2_F * 0.5, s); + s = qf.mla(-PI_C2_F * 0.5, s); } else if fabsfk(d) < TRIGRANGEMAX_F { let qf = rintfk(d * FRAC_2_PI); q = qf as i32; - s = qf.mul_add(-PI_A_F * 0.5, s); - s = qf.mul_add(-PI_B_F * 0.5, s); - s = qf.mul_add(-PI_C_F * 0.5, s); - s = qf.mul_add(-PI_D_F * 0.5, s); + s = qf.mla(-PI_A_F * 0.5, s); + s = qf.mla(-PI_B_F * 0.5, s); + 
s = qf.mla(-PI_C_F * 0.5, s); + s = qf.mla(-PI_D_F * 0.5, s); } else { let (dfidf, dfii) = rempif(d); q = dfii; @@ -154,17 +154,17 @@ pub fn sincosf(d: f32) -> (f32, f32) { s = s * s; let mut u = (-0.000_195_169_282_960_705_459_117_889_f32) - .mul_add(s, 0.008_332_157_507_538_795_471_191_41) - .mul_add(s, -0.166_666_537_523_269_653_320_312); + .mla(s, 0.008_332_157_507_538_795_471_191_41) + .mla(s, -0.166_666_537_523_269_653_320_312); u = u * s * t; let mut rsin = if d.is_neg_zero() { -0. } else { t + u }; u = (-2.718_118_423_672_422_068_193_55_e-7_f32) - .mul_add(s, 2.479_904_469_510_074_704_885_48_e-5) - .mul_add(s, -0.001_388_887_874_782_085_418_701_17) - .mul_add(s, 0.041_666_664_183_139_801_025_390_6) - .mul_add(s, -0.5); + .mla(s, 2.479_904_469_510_074_704_885_48_e-5) + .mla(s, -0.001_388_887_874_782_085_418_701_17) + .mla(s, 0.041_666_664_183_139_801_025_390_6) + .mla(s, -0.5); let mut rcos = u * s + 1.; @@ -206,16 +206,16 @@ pub fn tanf(d: f32) -> f32 { if fabsfk(d) < TRIGRANGEMAX2_F * 0.5 { let qf = rintfk(d * FRAC_2_PI); q = qf as i32; - x = qf.mul_add(-PI_A2_F * 0.5, x); - x = qf.mul_add(-PI_B2_F * 0.5, x); - x = qf.mul_add(-PI_C2_F * 0.5, x); + x = qf.mla(-PI_A2_F * 0.5, x); + x = qf.mla(-PI_B2_F * 0.5, x); + x = qf.mla(-PI_C2_F * 0.5, x); } else if fabsfk(d) < TRIGRANGEMAX_F { let qf = rintfk(d * FRAC_2_PI); q = qf as i32; - x = qf.mul_add(-PI_A_F * 0.5, x); - x = qf.mul_add(-PI_B_F * 0.5, x); - x = qf.mul_add(-PI_C_F * 0.5, x); - x = qf.mul_add(-PI_D_F * 0.5, x); + x = qf.mla(-PI_A_F * 0.5, x); + x = qf.mla(-PI_B_F * 0.5, x); + x = qf.mla(-PI_C_F * 0.5, x); + x = qf.mla(-PI_D_F * 0.5, x); } else { let (dfidf, dfii) = rempif(d); q = dfii; @@ -246,7 +246,7 @@ pub fn tanf(d: f32) -> f32 { 0.333_331_853_151_321_411_132_812, ); - u = s.mul_add(u * x, x); + u = s.mla(u * x, x); if (q & 1) != 0 { 1. 
/ u @@ -276,16 +276,16 @@ pub fn sincospif(d: f32) -> (f32, f32) { let s = s * s; let mut rsin = (-0.360_092_526_5_e-4_f32) - .mul_add(s, 0.249_008_811_1_e-2) - .mul_add(s, -0.807_455_107_6_e-1) - .mul_add(s, 0.785_398_185_3) + .mla(s, 0.249_008_811_1_e-2) + .mla(s, -0.807_455_107_6_e-1) + .mla(s, 0.785_398_185_3) * t; let mut rcos = 0.353_981_522_5_e-5_f32 - .mul_add(s, -0.325_957_400_5_e-3) - .mul_add(s, 0.158_543_158_3_e-1) - .mul_add(s, -0.308_425_128_5) - .mul_add(s, 1.); + .mla(s, -0.325_957_400_5_e-3) + .mla(s, 0.158_543_158_3_e-1) + .mla(s, -0.308_425_128_5) + .mla(s, 1.); if (q & 2) != 0 { core::mem::swap(&mut rcos, &mut rsin); @@ -424,11 +424,11 @@ pub fn asinf(d: f32) -> f32 { let x = if o { fabsfk(d) } else { x2.sqrt() }; let u = 0.419_745_482_5_e-1_f32 - .mul_add(x2, 0.242_404_602_5_e-1) - .mul_add(x2, 0.454_742_386_9_e-1) - .mul_add(x2, 0.749_502_927_1_e-1) - .mul_add(x2, 0.166_667_729_6) - .mul_add(x * x2, x); + .mla(x2, 0.242_404_602_5_e-1) + .mla(x2, 0.454_742_386_9_e-1) + .mla(x2, 0.749_502_927_1_e-1) + .mla(x2, 0.166_667_729_6) + .mla(x * x2, x); let r = if o { u } else { FRAC_PI_2 - 2. * u }; r.mul_sign(d) @@ -450,10 +450,10 @@ pub fn acosf(d: f32) -> f32 { x = if fabsfk(d) == 1. { 0. 
} else { x }; let mut u = 0.419_745_482_5_e-1_f32 - .mul_add(x2, 0.242_404_602_5_e-1) - .mul_add(x2, 0.454_742_386_9_e-1) - .mul_add(x2, 0.749_502_927_1_e-1) - .mul_add(x2, 0.166_667_729_6); + .mla(x2, 0.242_404_602_5_e-1) + .mla(x2, 0.454_742_386_9_e-1) + .mla(x2, 0.749_502_927_1_e-1) + .mla(x2, 0.166_667_729_6); u *= x * x2; @@ -529,8 +529,8 @@ fn expm1kf(d: f32) -> f32 { let qf = rintfk(d * R_LN2_F); let q = qf as i32; - let s = qf.mul_add(-L2U_F, d); - let s = qf.mul_add(-L2L_F, s); + let s = qf.mla(-L2U_F, d); + let s = qf.mla(-L2L_F, s); let s2 = s * s; let s4 = s2 * s2; @@ -650,10 +650,10 @@ pub fn logf(mut d: f32) -> f32 { let x2 = x * x; let t = 0.239_282_846_450_805_664_062_5_f32 - .mul_add(x2, 0.285_182_118_415_832_519_531_25) - .mul_add(x2, 0.400_005_877_017_974_853_515_625) - .mul_add(x2, 0.666_666_686_534_881_591_796_875) - .mul_add(x2, 2.); + .mla(x2, 0.285_182_118_415_832_519_531_25) + .mla(x2, 0.400_005_877_017_974_853_515_625) + .mla(x2, 0.666_666_686_534_881_591_796_875) + .mla(x2, 2.); if d == 0. { f32::NEG_INFINITY @@ -691,10 +691,10 @@ pub fn log2f(mut d: f32) -> f32 { let x2 = x * x; let t = 0.437_408_834_7 - .mul_add(x2, 0.576_484_382_2) - .mul_add(x2, 0.961_802_423); + .mla(x2, 0.576_484_382_2) + .mla(x2, 0.961_802_423); - let r = (x2 * x).mul_add(t, x.mul_add(0.288_539_004_3_e+1, e as f32)); + let r = (x2 * x).mla(t, x.mla(0.288_539_004_3_e+1, e as f32)); if d == 0. 
{ f32::NEG_INFINITY @@ -718,16 +718,16 @@ fn test_log2f() { pub fn exp10f(d: f32) -> f32 { let q = rintfk(d * LOG10_2_F); - let mut s = q.mul_add(-L10U_F, d); - s = q.mul_add(-L10L_F, s); + let mut s = q.mla(-L10U_F, d); + s = q.mla(-L10L_F, s); let mut u = 0.206_400_498_7 - .mul_add(s, 0.541_787_743_6) - .mul_add(s, 0.117_128_682_1_e+1) - .mul_add(s, 0.203_465_604_8_e+1) - .mul_add(s, 0.265_094_876_3_e+1) - .mul_add(s, 0.230_258_512_5_e+1) - .mul_add(s, 0.1_e+1); + .mla(s, 0.541_787_743_6) + .mla(s, 0.117_128_682_1_e+1) + .mla(s, 0.203_465_604_8_e+1) + .mla(s, 0.265_094_876_3_e+1) + .mla(s, 0.230_258_512_5_e+1) + .mla(s, 0.1_e+1); u = ldexp2kf(u, q as i32); @@ -754,12 +754,12 @@ pub fn exp2f(d: f32) -> f32 { let s = d - q; let mut u = 0.153_592_089_2_e-3 - .mul_add(s, 0.133_926_270_1_e-2) - .mul_add(s, 0.961_838_476_4_e-2) - .mul_add(s, 0.555_034_726_9_e-1) - .mul_add(s, 0.240_226_447_6) - .mul_add(s, 0.693_147_182_5) - .mul_add(s, 0.1_e+1); + .mla(s, 0.133_926_270_1_e-2) + .mla(s, 0.961_838_476_4_e-2) + .mla(s, 0.555_034_726_9_e-1) + .mla(s, 0.240_226_447_6) + .mla(s, 0.693_147_182_5) + .mla(s, 0.1_e+1); u = ldexp2kf(u, q as i32); @@ -839,11 +839,11 @@ pub fn cbrtf(mut d: f32) -> f32 { d = fabsfk(d); let x = (-0.601_564_466_953_277_587_890_625_f32) - .mul_add(d, 2.820_889_234_542_846_679_687_5) - .mul_add(d, -5.532_182_216_644_287_109_375) - .mul_add(d, 5.898_262_500_762_939_453_125) - .mul_add(d, -3.809_541_702_270_507_812_5) - .mul_add(d, 2.224_125_623_703_002_929_687_5); + .mla(d, 2.820_889_234_542_846_679_687_5) + .mla(d, -5.532_182_216_644_287_109_375) + .mla(d, 5.898_262_500_762_939_453_125) + .mla(d, -3.809_541_702_270_507_812_5) + .mla(d, 2.224_125_623_703_002_929_687_5); let y = d * x * x; (y - (2. / 3.) * y * (y * x - 1.)) * q diff --git a/src/f32x.rs b/src/f32x.rs index 69ad49a..a54e0fe 100644 --- a/src/f32x.rs +++ b/src/f32x.rs @@ -552,10 +552,10 @@ macro_rules! 
impl_math_f32 { impl MulAdd for F32x { #[inline] - fn mul_add(self, y: Self, z: Self) -> Self { + fn mla(self, y: Self, z: Self) -> Self { if cfg!(target_feature = "fma") { use std::simd::{StdFloat}; - ::mul_add(self, y, z) + self.mul_add(y, z) } else { self * y + z } @@ -567,7 +567,7 @@ macro_rules! impl_math_f32 { fn mul_sub(self, y: Self, z: Self) -> Self { if cfg!(target_feature = "fma") { use std::simd::{StdFloat}; - ::mul_add(self, y, -z) + self.mul_add(y, -z) } else { self * y - z } @@ -579,7 +579,7 @@ macro_rules! impl_math_f32 { fn neg_mul_add(self, y: Self, z: Self) -> Self { if cfg!(target_feature = "fma") { use std::simd::{StdFloat}; - ::mul_add(-self, y, z) + (-self).mul_add(y, z) } else { -self * y + z } @@ -777,12 +777,12 @@ macro_rules! impl_math_f32 { let c = F1_23X.mul_sign(x); let rint4x = (F32x::splat(4.) * x).abs().simd_gt(F1_23X).select( (F32x::splat(4.) * x), - (F32x::splat(4.).mul_add(x, c) - c).or_sign(x) + (F32x::splat(4.).mla(x, c) - c).or_sign(x) ); let rintx = x.abs().simd_gt(F1_23X).select(x, ((x + c) - c).or_sign(x)); - let fr = F32x::splat(-0.25).mul_add(rint4x, x); - let vi = F32x::splat(-4.).mul_add(rintx, rint4x).trunci(); + let fr = F32x::splat(-0.25).mla(rint4x, x); + let vi = F32x::splat(-4.).mla(rintx, rint4x).trunci(); (fr, vi) } } @@ -847,9 +847,9 @@ macro_rules! impl_math_f32 { s += q.cast::() * (-L2L_F); let u = F32x::splat(0.198_096_022_4_e-3) - .mul_add(s.0, F32x::splat(0.139_425_648_4_e-2)) - .mul_add(s.0, F32x::splat(0.833_345_670_3_e-2)) - .mul_add(s.0, F32x::splat(0.416_663_736_1_e-1)); + .mla(s.0, F32x::splat(0.139_425_648_4_e-2)) + .mla(s.0, F32x::splat(0.833_345_670_3_e-2)) + .mla(s.0, F32x::splat(0.416_663_736_1_e-1)); let mut t = s * u + F32x::splat(0.166_666_659_414_234_244_790_680_580_464); t = s * t + HALF; @@ -1206,8 +1206,8 @@ macro_rules! 
impl_math_f32 { let u = o .select_splat(-0.243_061_180_1_e-7, 0.309_384_205_4_e-6) - .mul_add(s, o.select_splat(0.359_057_708_e-5, -0.365_730_738_8_e-4)) - .mul_add(s, o.select_splat(-0.325_991_772_1_e-3, 0.249_039_358_5_e-2)); + .mla(s, o.select_splat(0.359_057_708_e-5, -0.365_730_738_8_e-4)) + .mla(s, o.select_splat(-0.325_991_772_1_e-3, 0.249_039_358_5_e-2)); let mut x = u * s + o.select_doubled( Doubled::new( @@ -1259,8 +1259,8 @@ macro_rules! impl_math_f32 { let u = o .select_splat(-0.243_061_180_1_e-7, 0.309_384_205_4_e-6) - .mul_add(s, o.select_splat(0.359_057_708_e-5, -0.365_730_738_8_e-4)) - .mul_add(s, o.select_splat(-0.325_991_772_1_e-3, 0.249_039_358_5_e-2)); + .mla(s, o.select_splat(0.359_057_708_e-5, -0.365_730_738_8_e-4)) + .mla(s, o.select_splat(-0.325_991_772_1_e-3, 0.249_039_358_5_e-2)); let mut x = u * s + o.select_doubled( Doubled::new( @@ -1299,8 +1299,8 @@ macro_rules! impl_math_f32 { #[inline] fn expm1fk(d: F32x) -> F32x { let q = (d * R_LN2_F).roundi(); - let s = q.cast::().mul_add(-L2U_F, d); - let s = q.cast::().mul_add(-L2L_F, s); + let s = q.cast::().mla(-L2U_F, d); + let s = q.cast::().mla(-L2L_F, s); let s2 = s * s; let s4 = s2 * s2; @@ -1313,7 +1313,7 @@ macro_rules! impl_math_f32 { 0.166_666_671_633_720_397_949_219, 0.5); - let u = (s * s).mul_add(u, s); + let u = (s * s).mla(u, s); q.simd_eq(I32x::splat(0)) .select(u, ldexp2kf(u + ONE, q) - ONE) diff --git a/src/f32x/fast_impl.rs b/src/f32x/fast_impl.rs index 2b7ac6c..1945829 100644 --- a/src/f32x/fast_impl.rs +++ b/src/f32x/fast_impl.rs @@ -11,14 +11,14 @@ macro_rules! 
impl_math_f32_fast { let s = d * FRAC_1_PI; let mut u = s.round(); let q = s.roundi(); - d = u.mul_add(-PI, d); + d = u.mla(-PI, d); let s = d * d; u = F32x::splat(-0.188_174_817_6_e-3) - .mul_add(s, F32x::splat(0.832_350_272_7_e-2)) - .mul_add(s, F32x::splat(-0.166_665_136_8)); - u = (s * d).mul_add(u, d); + .mla(s, F32x::splat(0.832_350_272_7_e-2)) + .mla(s, F32x::splat(-0.166_665_136_8)); + u = (s * d).mla(u, d); u = F32x::from_bits( ((q & I32x::splat(1)).simd_eq(I32x::splat(1)).to_int().cast() & (-ZERO).to_bits()) @@ -50,17 +50,17 @@ macro_rules! impl_math_f32_fast { pub fn cosf(mut d: F32x) -> F32x { let t = d; - let s = d.mul_add(FRAC_1_PI, -HALF); + let s = d.mla(FRAC_1_PI, -HALF); let mut u = s.round(); let q = s.roundi(); - d = u.mul_add(-PI, d - FRAC_PI_2); + d = u.mla(-PI, d - FRAC_PI_2); let s = d * d; u = F32x::splat(-0.188_174_817_6_e-3) - .mul_add(s, F32x::splat(0.832_350_272_7_e-2)) - .mul_add(s, F32x::splat(-0.166_665_136_8)); - u = (s * d).mul_add(u, d); + .mla(s, F32x::splat(0.832_350_272_7_e-2)) + .mla(s, F32x::splat(-0.166_665_136_8)); + u = (s * d).mla(u, d); u = F32x::from_bits( ((q & I32x::splat(1)).simd_eq(I32x::splat(0)).to_int().cast() & (-ZERO).to_bits()) @@ -103,18 +103,18 @@ macro_rules! 
impl_math_f32_fast { let x2 = x * x; let t = F32x::splat(0.239_282_846_450_805_664_062_5) - .mul_add(x2, F32x::splat(0.285_182_118_415_832_519_531_25)) - .mul_add(x2, F32x::splat(0.400_005_877_017_974_853_515_625)) - .mul_add(x2, F32x::splat(0.666_666_686_534_881_591_796_875)) - .mul_add(x2, F32x::splat(2.)); + .mla(x2, F32x::splat(0.285_182_118_415_832_519_531_25)) + .mla(x2, F32x::splat(0.400_005_877_017_974_853_515_625)) + .mla(x2, F32x::splat(0.666_666_686_534_881_591_796_875)) + .mla(x2, F32x::splat(2.)); //if !cfg!(feature = "enable_avx512f") && !cfg!(feature = "enable_avx512fnofma") { - x.mul_add( + x.mla( t, F32x::splat(0.693_147_180_559_945_286_226_764) * e.cast(), ) /* } else { - x.mul_add(t, F32x::splat(0.693_147_180_559_945_286_226_764) * e) + x.mla(t, F32x::splat(0.693_147_180_559_945_286_226_764) * e) }*/ } @@ -122,17 +122,17 @@ macro_rules! impl_math_f32_fast { fn expk3f(d: F32x) -> F32x { let q = (d * R_LN2_F).roundi(); - let mut s = q.cast::().mul_add(-L2U_F, d); - s = q.cast::().mul_add(-L2L_F, s); + let mut s = q.cast::().mla(-L2U_F, d); + s = q.cast::().mla(-L2L_F, s); let mut u = F32x::splat(0.000_198_527_617_612_853_646_278_381) - .mul_add(s, F32x::splat(0.001_393_043_552_525_341_510_772_71)) - .mul_add(s, F32x::splat(0.008_333_360_776_305_198_669_433_59)) - .mul_add(s, F32x::splat(0.041_666_485_369_205_474_853_515_6)) - .mul_add(s, F32x::splat(0.166_666_671_633_720_397_949_219)) - .mul_add(s, HALF); + .mla(s, F32x::splat(0.001_393_043_552_525_341_510_772_71)) + .mla(s, F32x::splat(0.008_333_360_776_305_198_669_433_59)) + .mla(s, F32x::splat(0.041_666_485_369_205_474_853_515_6)) + .mla(s, F32x::splat(0.166_666_671_633_720_397_949_219)) + .mla(s, HALF); - u = (s * s).mul_add(u, s + ONE); + u = (s * s).mla(u, s + ONE); u = ldexp2kf(u, q); F32x::from_bits(!d.simd_lt(F32x::splat(-104.)).to_int().cast::() & u.to_bits()) diff --git a/src/f32x/u05_impl.rs b/src/f32x/u05_impl.rs index db4c175..4e0a91a 100644 --- a/src/f32x/u05_impl.rs +++ 
b/src/f32x/u05_impl.rs @@ -19,8 +19,8 @@ macro_rules! impl_math_f32_u05 { let s2 = t.mul_as_doubled(t); let u = F32x::splat(0.309_384_205_4_e-6) - .mul_add(s, F32x::splat(-0.365_730_738_8_e-4)) - .mul_add(s, F32x::splat(0.249_039_358_5_e-2)); + .mla(s, F32x::splat(-0.365_730_738_8_e-4)) + .mla(s, F32x::splat(0.249_039_358_5_e-2)); let mut x = u * s + Doubled::new( F32x::splat(-0.080_745_510_756_969_451_904), @@ -38,8 +38,8 @@ macro_rules! impl_math_f32_u05 { let rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F32x::splat(-0.243_061_180_1_e-7) - .mul_add(s, F32x::splat(0.359_057_708_e-5)) - .mul_add(s, F32x::splat(-0.325_991_772_1_e-3)); + .mla(s, F32x::splat(0.359_057_708_e-5)) + .mla(s, F32x::splat(-0.325_991_772_1_e-3)); x = u * s + Doubled::new( F32x::splat(0.015_854_343_771_934_509_277), diff --git a/src/f32x/u10_impl.rs b/src/f32x/u10_impl.rs index 64882c8..9ce50fc 100644 --- a/src/f32x/u10_impl.rs +++ b/src/f32x/u10_impl.rs @@ -13,7 +13,7 @@ macro_rules! impl_math_f32_u10 { if d.abs().simd_lt(TRIGRANGEMAX2_F).all() { let u = (d * FRAC_1_PI).round(); q = u.roundi(); - let v = u.mul_add(-PI_A2_F, d); + let v = u.mla(-PI_A2_F, d); s = v.add_as_doubled(u * (-PI_B2_F)); s = s.add_checked(u * (-PI_C2_F)); } else { @@ -39,8 +39,8 @@ macro_rules! impl_math_f32_u10 { let s = s.square(); let mut u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); + .mla(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); let x = ONE.add_checked( F32x::splat(-0.166_666_597_127_914_428_710_938).add_checked_as_doubled(u * s.0) * s, @@ -66,7 +66,7 @@ macro_rules! 
impl_math_f32_u10 { pub fn sinf_deterministic(d: F32x) -> F32x { let u = (d * FRAC_1_PI).round(); let mut q = u.roundi(); - let v = u.mul_add((-PI_A2_F), d); + let v = u.mla((-PI_A2_F), d); let mut s = v.add_as_doubled(u * (-PI_B2_F)); s = s.add_checked(u * (-PI_C2_F)); let g = d.abs().simd_lt(TRIGRANGEMAX2_F); @@ -97,8 +97,8 @@ macro_rules! impl_math_f32_u10 { s = s.square(); let mut u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); + .mla(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); let x = ONE.add_checked( F32x::splat(-0.166_666_597_127_914_428_710_938).add_checked_as_doubled(u * s.0) * s, @@ -140,9 +140,9 @@ macro_rules! impl_math_f32_u10 { let mut s: Doubled; if d.abs().simd_lt(TRIGRANGEMAX2_F).all() { - let dq = (d.mul_add(FRAC_1_PI, F32x::splat(-0.5))) + let dq = (d.mla(FRAC_1_PI, F32x::splat(-0.5))) .round() - .mul_add(F32x::splat(2.), ONE); + .mla(F32x::splat(2.), ONE); q = dq.roundi(); s = d.add_as_doubled(dq * (-PI_A2_F) * HALF); s += dq * (-PI_B2_F) * HALF; @@ -171,8 +171,8 @@ macro_rules! impl_math_f32_u10 { s = s.square(); let u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); + .mla(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); let x = ONE.add_checked( F32x::splat(-0.166_666_597_127_914_428_710_938).add_checked_as_doubled(u * s.0) * s, @@ -194,9 +194,9 @@ macro_rules! 
impl_math_f32_u10 { /// /// NOTE: This version is slower, but SIMD lanes are independent pub fn cosf_deterministic(d: F32x) -> F32x { - let dq = (d.mul_add(FRAC_1_PI, F32x::splat(-0.5))) + let dq = (d.mla(FRAC_1_PI, F32x::splat(-0.5))) .round() - .mul_add(F32x::splat(2.), ONE); + .mla(F32x::splat(2.), ONE); let mut q = dq.roundi(); let mut s = d.add_as_doubled(dq * (-PI_A2_F * HALF)); s += dq * (-PI_B2_F * HALF); @@ -230,8 +230,8 @@ macro_rules! impl_math_f32_u10 { s = s.square(); let u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); + .mla(s.0, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s.0, F32x::splat(0.008_333_078_585_565_090_179_443_36)); let x = ONE.add_checked( F32x::splat(-0.166_666_597_127_914_428_710_938).add_checked_as_doubled(u * s.0) * s, @@ -276,7 +276,7 @@ macro_rules! impl_math_f32_u10 { if d.abs().simd_lt(TRIGRANGEMAX2_F).all() { let u = (d * FRAC_2_PI).round(); q = u.roundi(); - let v = u.mul_add(-PI_A2_F * HALF, d); + let v = u.mla(-PI_A2_F * HALF, d); s = v.add_as_doubled(u * (-PI_B2_F) * HALF); s = s.add_checked(u * (-PI_C2_F) * HALF); } else { @@ -292,8 +292,8 @@ macro_rules! impl_math_f32_u10 { s.0 = s.square_as_f(); let u = F32x::splat(-0.000_195_169_282_960_705_459_117_889) - .mul_add(s.0, F32x::splat(0.008_332_157_507_538_795_471_191_41)) - .mul_add(s.0, F32x::splat(-0.166_666_537_523_269_653_320_312)) + .mla(s.0, F32x::splat(0.008_332_157_507_538_795_471_191_41)) + .mla(s.0, F32x::splat(-0.166_666_537_523_269_653_320_312)) * (s.0 * t.0); let x = t.add_checked(u); @@ -302,10 +302,10 @@ macro_rules! 
impl_math_f32_u10 { let rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F32x::splat(-2.718_118_423_672_422_068_193_55_e-7) - .mul_add(s.0, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) - .mul_add(s.0, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) - .mul_add(s.0, F32x::splat(0.041_666_664_183_139_801_025_390_6)) - .mul_add(s.0, F32x::splat(-0.5)); + .mla(s.0, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) + .mla(s.0, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) + .mla(s.0, F32x::splat(0.041_666_664_183_139_801_025_390_6)) + .mla(s.0, F32x::splat(-0.5)); let x = ONE.add_checked(s.0.mul_as_doubled(u)); let ry = F32x::from(x); @@ -339,7 +339,7 @@ macro_rules! impl_math_f32_u10 { pub fn sincosf_deterministic(d: F32x) -> (F32x, F32x) { let u = (d * FRAC_2_PI).round(); let mut q = u.roundi(); - let v = u.mul_add(-PI_A2_F * HALF, d); + let v = u.mla(-PI_A2_F * HALF, d); let mut s = v.add_as_doubled(u * (-PI_B2_F * HALF)); s = s.add_checked(u * (-PI_C2_F * HALF)); let g = d.abs().simd_lt(TRIGRANGEMAX2_F); @@ -358,8 +358,8 @@ macro_rules! impl_math_f32_u10 { s.0 = s.square_as_f(); let u = F32x::splat(-0.000_195_169_282_960_705_459_117_889) - .mul_add(s.0, F32x::splat(0.008_332_157_507_538_795_471_191_41)) - .mul_add(s.0, F32x::splat(-0.166_666_537_523_269_653_320_312)) + .mla(s.0, F32x::splat(0.008_332_157_507_538_795_471_191_41)) + .mla(s.0, F32x::splat(-0.166_666_537_523_269_653_320_312)) * (s.0 * t.0); let x = t.add_checked(u); @@ -368,10 +368,10 @@ macro_rules! 
impl_math_f32_u10 { rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F32x::splat(-2.718_118_423_672_422_068_193_55_e-7) - .mul_add(s.0, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) - .mul_add(s.0, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) - .mul_add(s.0, F32x::splat(0.041_666_664_183_139_801_025_390_6)) - .mul_add(s.0, F32x::splat(-0.5)); + .mla(s.0, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) + .mla(s.0, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) + .mla(s.0, F32x::splat(0.041_666_664_183_139_801_025_390_6)) + .mla(s.0, F32x::splat(-0.5)); let x = ONE.add_checked(s.0.mul_as_doubled(u)); let ry = F32x::from(x); @@ -425,7 +425,7 @@ macro_rules! impl_math_f32_u10 { let mut s = if d.abs().simd_lt(TRIGRANGEMAX2_F).all() { let u = (d * FRAC_2_PI).round(); q = u.roundi(); - let v = u.mul_add(-PI_A2_F * HALF, d); + let v = u.mla(-PI_A2_F * HALF, d); v.add_as_doubled(u * (-PI_B2_F) * HALF) .add_checked(u * (-PI_C2_F) * HALF) } else { @@ -450,10 +450,10 @@ macro_rules! impl_math_f32_u10 { s = s.normalize(); let u = F32x::splat(0.004_466_364_625_841_379_165_649_41) - .mul_add(s.0, F32x::splat(-8.392_018_207_814_544_439_315_8_e-5)) - .mul_add(s.0, F32x::splat(0.010_963_924_229_145_050_048_828_1)) - .mul_add(s.0, F32x::splat(0.021_236_030_384_898_185_729_980_5)) - .mul_add(s.0, F32x::splat(0.054_068_714_380_264_282_226_562_5)); + .mla(s.0, F32x::splat(-8.392_018_207_814_544_439_315_8_e-5)) + .mla(s.0, F32x::splat(0.010_963_924_229_145_050_048_828_1)) + .mla(s.0, F32x::splat(0.021_236_030_384_898_185_729_980_5)) + .mla(s.0, F32x::splat(0.054_068_714_380_264_282_226_562_5)); let mut x = F32x::splat(0.133_325_666_189_193_725_585_938).add_checked_as_doubled(u * s.0); @@ -477,7 +477,7 @@ macro_rules! 
impl_math_f32_u10 { pub fn tanf_deterministic(d: F32x) -> F32x { let u = (d * FRAC_2_PI).round(); let mut q = u.roundi(); - let v = u.mul_add(-PI_A2_F * HALF, d); + let v = u.mla(-PI_A2_F * HALF, d); let mut s = v.add_as_doubled(u * (-PI_B2_F * HALF)); s = s.add_checked(u * (-PI_C2_F * HALF)); let g = d.abs().simd_lt(TRIGRANGEMAX2_F); @@ -506,10 +506,10 @@ macro_rules! impl_math_f32_u10 { s = s.normalize(); let u = F32x::splat(0.004_466_364_625_841_379_165_649_41) - .mul_add(s.0, F32x::splat(-8.392_018_207_814_544_439_315_8_e-5)) - .mul_add(s.0, F32x::splat(0.010_963_924_229_145_050_048_828_1)) - .mul_add(s.0, F32x::splat(0.021_236_030_384_898_185_729_980_5)) - .mul_add(s.0, F32x::splat(0.054_068_714_380_264_282_226_562_5)); + .mla(s.0, F32x::splat(-8.392_018_207_814_544_439_315_8_e-5)) + .mla(s.0, F32x::splat(0.010_963_924_229_145_050_048_828_1)) + .mla(s.0, F32x::splat(0.021_236_030_384_898_185_729_980_5)) + .mla(s.0, F32x::splat(0.054_068_714_380_264_282_226_562_5)); let mut x = F32x::splat(0.133_325_666_189_193_725_585_938).add_checked_as_doubled(u * s.0); @@ -560,13 +560,13 @@ macro_rules! 
impl_math_f32_u10 { t = t.normalize(); let u = F32x::splat(-0.001_763_979_089_446_365_833_282_47) - .mul_add(t.0, F32x::splat(0.010_790_090_076_625_347_137_451_2)) - .mul_add(t.0, F32x::splat(-0.030_956_460_162_997_245_788_574_2)) - .mul_add(t.0, F32x::splat(0.057_736_508_548_259_735_107_421_9)) - .mul_add(t.0, F32x::splat(-0.083_895_072_340_965_270_996_093_8)) - .mul_add(t.0, F32x::splat(0.109_463_557_600_975_036_621_094)) - .mul_add(t.0, F32x::splat(-0.142_626_821_994_781_494_140_625)) - .mul_add(t.0, F32x::splat(0.199_983_194_470_405_578_613_281)); + .mla(t.0, F32x::splat(0.010_790_090_076_625_347_137_451_2)) + .mla(t.0, F32x::splat(-0.030_956_460_162_997_245_788_574_2)) + .mla(t.0, F32x::splat(0.057_736_508_548_259_735_107_421_9)) + .mla(t.0, F32x::splat(-0.083_895_072_340_965_270_996_093_8)) + .mla(t.0, F32x::splat(0.109_463_557_600_975_036_621_094)) + .mla(t.0, F32x::splat(-0.142_626_821_994_781_494_140_625)) + .mla(t.0, F32x::splat(0.199_983_194_470_405_578_613_281)); t *= F32x::splat(-0.333_332_866_430_282_592_773_438).add_checked_as_doubled(u * t.0); t = s * ONE.add_checked(t); @@ -632,10 +632,10 @@ macro_rules! impl_math_f32_u10 { x = d.abs().simd_eq(ONE).select_doubled(Doubled::from(ZERO), x); let u = F32x::splat(0.419_745_482_5_e-1) - .mul_add(x2, F32x::splat(0.242_404_602_5_e-1)) - .mul_add(x2, F32x::splat(0.454_742_386_9_e-1)) - .mul_add(x2, F32x::splat(0.749_502_927_1_e-1)) - .mul_add(x2, F32x::splat(0.166_667_729_6)) + .mla(x2, F32x::splat(0.242_404_602_5_e-1)) + .mla(x2, F32x::splat(0.454_742_386_9_e-1)) + .mla(x2, F32x::splat(0.749_502_927_1_e-1)) + .mla(x2, F32x::splat(0.166_667_729_6)) * (x2 * x.0); let y = Doubled::new( @@ -671,10 +671,10 @@ macro_rules! 
impl_math_f32_u10 { x = d.abs().simd_eq(ONE).select_doubled(Doubled::from(ZERO), x); let u = F32x::splat(0.419_745_482_5_e-1) - .mul_add(x2, F32x::splat(0.242_404_602_5_e-1)) - .mul_add(x2, F32x::splat(0.454_742_386_9_e-1)) - .mul_add(x2, F32x::splat(0.749_502_927_1_e-1)) - .mul_add(x2, F32x::splat(0.166_667_729_6)) + .mla(x2, F32x::splat(0.242_404_602_5_e-1)) + .mla(x2, F32x::splat(0.454_742_386_9_e-1)) + .mla(x2, F32x::splat(0.749_502_927_1_e-1)) + .mla(x2, F32x::splat(0.166_667_729_6)) * (x2 * x.0); let mut y = Doubled::new( @@ -820,9 +820,9 @@ macro_rules! impl_math_f32_u10 { let x2 = x.square(); let t = F32x::splat(0.239_282_846_450_805_664_062_5) - .mul_add(x2.0, F32x::splat(0.285_182_118_415_832_519_531_25)) - .mul_add(x2.0, F32x::splat(0.400_005_877_017_974_853_515_625)) - .mul_add(x2.0, F32x::splat(0.666_666_686_534_881_591_796_875)); + .mla(x2.0, F32x::splat(0.285_182_118_415_832_519_531_25)) + .mla(x2.0, F32x::splat(0.400_005_877_017_974_853_515_625)) + .mla(x2.0, F32x::splat(0.666_666_686_534_881_591_796_875)); let mut s = Doubled::::splat(crate::f32::D_LN2) * e.cast(); s = s.add_checked(x.scale(F32x::splat(2.))); @@ -946,8 +946,8 @@ macro_rules! impl_math_f32_u10 { let x2 = x.0 * x.0; let t = F32x::splat(0.302_729_487_4) - .mul_add(x2, F32x::splat(0.399_610_817_4)) - .mul_add(x2, F32x::splat(0.666_669_488)); + .mla(x2, F32x::splat(0.399_610_817_4)) + .mla(x2, F32x::splat(0.666_669_488)); s = s.add_checked(x.scale(F32x::splat(2.))); s = s.add_checked(x2 * x.0 * t); @@ -1010,8 +1010,8 @@ macro_rules! impl_math_f32_u10 { let x2 = x.0 * x.0; let t = F32x::splat(0.131_428_986_8) - .mul_add(x2, F32x::splat(0.173_549_354_1)) - .mul_add(x2, F32x::splat(0.289_530_962_7)); + .mla(x2, F32x::splat(0.173_549_354_1)) + .mla(x2, F32x::splat(0.289_530_962_7)); s = s.add_checked(x * Doubled::new( F32x::splat(0.868_588_984), @@ -1071,8 +1071,8 @@ macro_rules! 
impl_math_f32_u10 { let x2 = x.0 * x.0; let t = F32x::splat(0.437_455_028_3) - .mul_add(x2, F32x::splat(0.576_479_017_7)) - .mul_add(x2, F32x::splat(0.961_801_290_512)); + .mla(x2, F32x::splat(0.576_479_017_7)) + .mla(x2, F32x::splat(0.961_801_290_512)); let mut s = ef + x * Doubled::new( F32x::splat(2.885_390_043_258_666_992_2), @@ -1121,14 +1121,14 @@ macro_rules! impl_math_f32_u10 { let dp1 = o.select(dp1 * (F1_32X * F1_32X), dp1); let e = ilogb2kf(dp1 * F32x::splat(1. / 0.75)); let t = ldexp3kf(ONE, -e); - m = d.mul_add(t, t - ONE); + m = d.mla(t, t - ONE); let e = o.select(e - I32x::splat(64), e); Doubled::::splat(crate::f32::D_LN2) * e.cast() }/* else { let e = vgetexp_vf_vf(dp1, F32x::splat(1. / 0.75)); let e = e.simd_eq(INFINITY).select(F32x::splat(128.), e); let t = ldexp3kf(ONE, -e.roundi()); - m = d.mul_add(t, t - ONE); + m = d.mla(t, t - ONE); Doubled::::splat(crate::f32::D_LN2) * e }*/; @@ -1136,8 +1136,8 @@ macro_rules! impl_math_f32_u10 { let x2 = x.0 * x.0; let t = F32x::splat(0.302_729_487_4) - .mul_add(x2, F32x::splat(0.399_610_817_4)) - .mul_add(x2, F32x::splat(0.666_669_488)); + .mla(x2, F32x::splat(0.399_610_817_4)) + .mla(x2, F32x::splat(0.666_669_488)); s = s.add_checked(x.scale(F32x::splat(2.))); s = s.add_checked(x2 * x.0 * t); @@ -1167,17 +1167,17 @@ macro_rules! 
impl_math_f32_u10 { pub fn expf(d: F32x) -> F32x { let q = (d * R_LN2_F).roundi(); - let s = q.cast::().mul_add(-L2U_F, d); - let s = q.cast::().mul_add(-L2L_F, s); + let s = q.cast::().mla(-L2U_F, d); + let s = q.cast::().mla(-L2L_F, s); let mut u = F32x::splat(0.000_198_527_617_612_853_646_278_381) - .mul_add(s, F32x::splat(0.001_393_043_552_525_341_510_772_71)) - .mul_add(s, F32x::splat(0.008_333_360_776_305_198_669_433_59)) - .mul_add(s, F32x::splat(0.041_666_485_369_205_474_853_515_6)) - .mul_add(s, F32x::splat(0.166_666_671_633_720_397_949_219)) - .mul_add(s, HALF); + .mla(s, F32x::splat(0.001_393_043_552_525_341_510_772_71)) + .mla(s, F32x::splat(0.008_333_360_776_305_198_669_433_59)) + .mla(s, F32x::splat(0.041_666_485_369_205_474_853_515_6)) + .mla(s, F32x::splat(0.166_666_671_633_720_397_949_219)) + .mla(s, HALF); - u = ONE + (s * s).mul_add(u, s); + u = ONE + (s * s).mla(u, s); u = ldexp2kf(u, q); @@ -1203,15 +1203,15 @@ macro_rules! impl_math_f32_u10 { let mut u = (d * LOG10_2_F).round(); let q = u.roundi(); - let s = u.mul_add(-L10U_F, d); - let s = u.mul_add(-L10L_F, s); + let s = u.mla(-L10U_F, d); + let s = u.mla(-L10L_F, s); u = F32x::splat(0.680_255_591_9_e-1) - .mul_add(s, F32x::splat(0.207_808_032_6)) - .mul_add(s, F32x::splat(0.539_390_385_2)) - .mul_add(s, F32x::splat(0.117_124_533_7_e+1)) - .mul_add(s, F32x::splat(0.203_467_869_8_e+1)) - .mul_add(s, F32x::splat(0.265_094_900_1_e+1)); + .mla(s, F32x::splat(0.207_808_032_6)) + .mla(s, F32x::splat(0.539_390_385_2)) + .mla(s, F32x::splat(0.117_124_533_7_e+1)) + .mla(s, F32x::splat(0.203_467_869_8_e+1)) + .mla(s, F32x::splat(0.265_094_900_1_e+1)); let x = Doubled::new( F32x::splat(2.302_585_124_969_482_421_9), F32x::splat(-3.170_517_251_649_359_315_7_e-08) @@ -1273,14 +1273,14 @@ macro_rules! 
impl_math_f32_u10 { let s = d - u; u = F32x::splat(0.153_592_089_2_e-3) - .mul_add(s, F32x::splat(0.133_926_270_1_e-2)) - .mul_add(s, F32x::splat(0.961_838_476_4_e-2)) - .mul_add(s, F32x::splat(0.555_034_726_9_e-1)) - .mul_add(s, F32x::splat(0.240_226_447_6)) - .mul_add(s, F32x::splat(0.693_147_182_5)); + .mla(s, F32x::splat(0.133_926_270_1_e-2)) + .mla(s, F32x::splat(0.961_838_476_4_e-2)) + .mla(s, F32x::splat(0.555_034_726_9_e-1)) + .mla(s, F32x::splat(0.240_226_447_6)) + .mla(s, F32x::splat(0.693_147_182_5)); if cfg!(target_feature = "fma") { - u = u.mul_add(s, ONE); + u = u.mla(s, ONE); } else { u = ONE.add_checked(u.mul_as_doubled(s)).normalize().0; } @@ -1324,8 +1324,8 @@ macro_rules! impl_math_f32_u10 { let x2 = x.square(); let t = F32x::splat(0.240_320_354_700_088_500_976_562) - .mul_add(x2.0, F32x::splat(0.285_112_679_004_669_189_453_125)) - .mul_add(x2.0, F32x::splat(0.400_007_992_982_864_379_882_812)); + .mla(x2.0, F32x::splat(0.285_112_679_004_669_189_453_125)) + .mla(x2.0, F32x::splat(0.400_007_992_982_864_379_882_812)); let c = Doubled::new( F32x::splat(0.666_666_626_930_236_816_406_25), F32x::splat(3.691_838_612_596_143_320_843_11_e-9), @@ -1348,10 +1348,10 @@ macro_rules! impl_math_f32_u10 { s = s.normalize(); let mut u = F32x::splat(0.001_363_246_468_827_128_410_339_36) - .mul_add(s.0, F32x::splat(0.008_365_969_173_610_210_418_701_17)) - .mul_add(s.0, F32x::splat(0.041_671_082_377_433_776_855_468_8)) - .mul_add(s.0, F32x::splat(0.166_665_524_244_308_471_679_688)) - .mul_add(s.0, F32x::splat(0.499_999_850_988_388_061_523_438)); + .mla(s.0, F32x::splat(0.008_365_969_173_610_210_418_701_17)) + .mla(s.0, F32x::splat(0.041_671_082_377_433_776_855_468_8)) + .mla(s.0, F32x::splat(0.166_665_524_244_308_471_679_688)) + .mla(s.0, F32x::splat(0.499_999_850_988_388_061_523_438)); let mut t = s.add_checked(s.square() * u); @@ -1464,11 +1464,11 @@ macro_rules! 
impl_math_f32_u10 { d = d.abs(); let mut x = F32x::splat(-0.601_564_466_953_277_587_890_625) - .mul_add(d, F32x::splat(2.820_889_234_542_846_679_687_5)) - .mul_add(d, F32x::splat(-5.532_182_216_644_287_109_375)) - .mul_add(d, F32x::splat(5.898_262_500_762_939_453_125)) - .mul_add(d, F32x::splat(-3.809_541_702_270_507_812_5)) - .mul_add(d, F32x::splat(2.224_125_623_703_002_929_687_5)); + .mla(d, F32x::splat(2.820_889_234_542_846_679_687_5)) + .mla(d, F32x::splat(-5.532_182_216_644_287_109_375)) + .mla(d, F32x::splat(5.898_262_500_762_939_453_125)) + .mla(d, F32x::splat(-3.809_541_702_270_507_812_5)) + .mla(d, F32x::splat(2.224_125_623_703_002_929_687_5)); let mut y = x * x; y = y * y; @@ -1543,7 +1543,7 @@ macro_rules! impl_math_f32_u10 { 0.943_515_777_6, 0.110_248_955_e-3, ) - .mul_add( + .mla( t, F32x::select3( o2, @@ -1553,7 +1553,7 @@ macro_rules! impl_math_f32_u10 { 0.816_001_993_4_e-4, ), ) - .mul_add( + .mla( t, F32x::select3( o2, @@ -1563,7 +1563,7 @@ macro_rules! impl_math_f32_u10 { 0.152_846_885_6_e-3, ), ) - .mul_add( + .mla( t, F32x::select3( o2, @@ -1573,7 +1573,7 @@ macro_rules! impl_math_f32_u10 { -0.235_506_871_8_e-3, ), ) - .mul_add( + .mla( t, F32x::select3( o2, @@ -1583,7 +1583,7 @@ macro_rules! impl_math_f32_u10 { 0.496_224_209_2_e-3, ), ) - .mul_add( + .mla( t, F32x::select3( o2, @@ -1593,7 +1593,7 @@ macro_rules! impl_math_f32_u10 { -0.119_348_801_7_e-2, ), ) - .mul_add( + .mla( t, F32x::select3( o2, @@ -1603,7 +1603,7 @@ macro_rules! impl_math_f32_u10 { 0.289_159_943_3_e-2, ), ) - .mul_add( + .mla( t, F32x::select3( o2, @@ -1613,7 +1613,7 @@ macro_rules! impl_math_f32_u10 { -0.738_545_181_2_e-2, ), ) - .mul_add( + .mla( t, F32x::select3( o2, diff --git a/src/f32x/u15_impl.rs b/src/f32x/u15_impl.rs index bbb0051..8f1a22e 100644 --- a/src/f32x/u15_impl.rs +++ b/src/f32x/u15_impl.rs @@ -25,7 +25,7 @@ macro_rules! impl_math_f32_u15 { -0.386_950_403_5, 0.111_534_416_7_e+1, ) - .mul_add( + .mla( u.0, F32x::select4( o0, @@ -37,7 +37,7 @@ macro_rules! 
impl_math_f32_u15 { -0.945_490_419_9, ), ) - .mul_add( + .mla( u.0, F32x::select4( o0, @@ -49,7 +49,7 @@ macro_rules! impl_math_f32_u15 { -0.366_725_951_4, ), ) - .mul_add( + .mla( u.0, F32x::select4( o0, @@ -61,7 +61,7 @@ macro_rules! impl_math_f32_u15 { 0.715_566_337_1, ), ) - .mul_add( + .mla( u.0, F32x::select4( o0, diff --git a/src/f32x/u35_impl.rs b/src/f32x/u35_impl.rs index 816fd6a..677c83c 100644 --- a/src/f32x/u35_impl.rs +++ b/src/f32x/u35_impl.rs @@ -14,16 +14,16 @@ macro_rules! impl_math_f32_u35 { if d.abs().simd_lt(TRIGRANGEMAX2_F).all() { q = (d * FRAC_1_PI).roundi(); u = q.cast(); - d = u.mul_add(-PI_A2_F, d); - d = u.mul_add(-PI_B2_F, d); - d = u.mul_add(-PI_C2_F, d); + d = u.mla(-PI_A2_F, d); + d = u.mla(-PI_B2_F, d); + d = u.mla(-PI_C2_F, d); } else if d.abs().simd_lt(TRIGRANGEMAX_F).all() { q = (d * FRAC_1_PI).roundi(); u = q.cast(); - d = u.mul_add(-PI_A_F, d); - d = u.mul_add(-PI_B_F, d); - d = u.mul_add(-PI_C_F, d); - d = u.mul_add(-PI_D_F, d); + d = u.mla(-PI_A_F, d); + d = u.mla(-PI_B_F, d); + d = u.mla(-PI_C_F, d); + d = u.mla(-PI_D_F, d); } else { let (mut dfidf, dfii) = rempif(d); q = dfii & I32x::splat(3); @@ -49,9 +49,9 @@ macro_rules! impl_math_f32_u35 { ); let mut u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) - .mul_add(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); + .mla(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) + .mla(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); u = s * (u * d) + d; @@ -69,17 +69,17 @@ macro_rules! 
impl_math_f32_u35 { let mut q = (d * FRAC_1_PI).roundi(); let u: F32x = q.cast(); - d = u.mul_add(-PI_A2_F, d); - d = u.mul_add(-PI_B2_F, d); - d = u.mul_add(-PI_C2_F, d); + d = u.mla(-PI_A2_F, d); + d = u.mla(-PI_B2_F, d); + d = u.mla(-PI_C2_F, d); let g = r.abs().simd_lt(TRIGRANGEMAX2_F); if !g.all() { let s: F32x = q.cast(); - let mut u = s.mul_add(-PI_A_F, r); - u = s.mul_add(-PI_B_F, u); - u = s.mul_add(-PI_C_F, u); - u = s.mul_add(-PI_D_F, u); + let mut u = s.mla(-PI_A_F, r); + u = s.mla(-PI_B_F, u); + u = s.mla(-PI_C_F, u); + u = s.mla(-PI_D_F, u); d = g.select(d, u); let g = r.abs().simd_lt(TRIGRANGEMAX_F); @@ -113,9 +113,9 @@ macro_rules! impl_math_f32_u35 { ); let mut u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) - .mul_add(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); + .mla(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) + .mla(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); u = s * (u * d) + d; @@ -151,18 +151,18 @@ macro_rules! impl_math_f32_u35 { q = q + q + I32x::splat(1); let u: F32x = q.cast(); - d = u.mul_add(-PI_A2_F * HALF, d); - d = u.mul_add(-PI_B2_F * HALF, d); - d = u.mul_add(-PI_C2_F * HALF, d); + d = u.mla(-PI_A2_F * HALF, d); + d = u.mla(-PI_B2_F * HALF, d); + d = u.mla(-PI_C2_F * HALF, d); } else if d.abs().simd_lt(TRIGRANGEMAX_F).all() { q = (d * FRAC_1_PI - HALF).roundi(); q = q + q + I32x::splat(1); let u: F32x = q.cast(); - d = u.mul_add(-PI_A_F * HALF, d); - d = u.mul_add(-PI_B_F * HALF, d); - d = u.mul_add(-PI_C_F * HALF, d); - d = u.mul_add(-PI_D_F * HALF, d); + d = u.mla(-PI_A_F * HALF, d); + d = u.mla(-PI_B_F * HALF, d); + d = u.mla(-PI_C_F * HALF, d); + d = u.mla(-PI_D_F * HALF, d); } else { let (mut dfidf, dfii) = rempif(d); q = dfii & I32x::splat(3); @@ -189,9 +189,9 @@ macro_rules! 
impl_math_f32_u35 { ); let u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) - .mul_add(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); + .mla(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) + .mla(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); s * (u * d) + d } @@ -208,17 +208,17 @@ macro_rules! impl_math_f32_u35 { let mut q = (d * FRAC_1_PI - HALF).roundi(); q = q + q + I32x::splat(1); let u: F32x = q.cast(); - d = u.mul_add(-PI_A2_F * HALF, d); - d = u.mul_add(-PI_B2_F * HALF, d); - d = u.mul_add(-PI_C2_F * HALF, d); + d = u.mla(-PI_A2_F * HALF, d); + d = u.mla(-PI_B2_F * HALF, d); + d = u.mla(-PI_C2_F * HALF, d); let g = r.abs().simd_lt(TRIGRANGEMAX2_F); if !g.all() { let s: F32x = q.cast(); - let mut u = s.mul_add(-PI_A_F * HALF, r); - u = s.mul_add(-PI_B_F * HALF, u); - u = s.mul_add(-PI_C_F * HALF, u); - u = s.mul_add(-PI_D_F * HALF, u); + let mut u = s.mla(-PI_A_F * HALF, r); + u = s.mla(-PI_B_F * HALF, u); + u = s.mla(-PI_C_F * HALF, u); + u = s.mla(-PI_D_F * HALF, u); d = g.select(d, u); let g = r.abs().simd_lt(TRIGRANGEMAX_F); @@ -253,9 +253,9 @@ macro_rules! impl_math_f32_u35 { ); let u = F32x::splat(2.608_315_980_978_659_354_150_3_e-6) - .mul_add(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) - .mul_add(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) - .mul_add(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); + .mla(s, F32x::splat(-0.000_198_106_907_191_686_332_225_8)) + .mla(s, F32x::splat(0.008_333_078_585_565_090_179_443_36)) + .mla(s, F32x::splat(-0.166_666_597_127_914_428_710_938)); s * (u * d) + d } @@ -289,16 +289,16 @@ macro_rules! 
impl_math_f32_u35 { if d.abs().simd_lt(TRIGRANGEMAX2_F).all() { q = (d * FRAC_2_PI).roundi(); let u: F32x = q.cast(); - s = u.mul_add(-PI_A2_F * HALF, s); - s = u.mul_add(-PI_B2_F * HALF, s); - s = u.mul_add(-PI_C2_F * HALF, s); + s = u.mla(-PI_A2_F * HALF, s); + s = u.mla(-PI_B2_F * HALF, s); + s = u.mla(-PI_C2_F * HALF, s); } else if d.abs().simd_lt(TRIGRANGEMAX_F).all() { q = (d * FRAC_2_PI).roundi(); let u: F32x = q.cast(); - s = u.mul_add(-PI_A_F * HALF, s); - s = u.mul_add(-PI_B_F * HALF, s); - s = u.mul_add(-PI_C_F * HALF, s); - s = u.mul_add(-PI_D_F * HALF, s); + s = u.mla(-PI_A_F * HALF, s); + s = u.mla(-PI_B_F * HALF, s); + s = u.mla(-PI_C_F * HALF, s); + s = u.mla(-PI_D_F * HALF, s); } else { let (dfidf, dfii) = rempif(d); q = dfii; @@ -311,19 +311,19 @@ macro_rules! impl_math_f32_u35 { s = s * s; let u = F32x::splat(-0.000_195_169_282_960_705_459_117_889) - .mul_add(s, F32x::splat(0.008_332_157_507_538_795_471_191_41)) - .mul_add(s, F32x::splat(-0.166_666_537_523_269_653_320_312)); + .mla(s, F32x::splat(0.008_332_157_507_538_795_471_191_41)) + .mla(s, F32x::splat(-0.166_666_537_523_269_653_320_312)); - let rx = (u * s).mul_add(t, t); + let rx = (u * s).mla(t, t); let rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F32x::splat(-2.718_118_423_672_422_068_193_55_e-7) - .mul_add(s, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) - .mul_add(s, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) - .mul_add(s, F32x::splat(0.041_666_664_183_139_801_025_390_6)) - .mul_add(s, F32x::splat(-0.5)); + .mla(s, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) + .mla(s, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) + .mla(s, F32x::splat(0.041_666_664_183_139_801_025_390_6)) + .mla(s, F32x::splat(-0.5)); - let ry = s.mul_add(u, ONE); + let ry = s.mla(u, ONE); let o = (q & I32x::splat(1)).simd_eq(I32x::splat(0)); let mut rsin = o.select(rx, ry); @@ -351,18 +351,18 @@ macro_rules! 
impl_math_f32_u35 { pub fn sincosf_deterministic(d: F32x) -> (F32x, F32x) { let mut q = (d * FRAC_2_PI).roundi(); let u: F32x = q.cast(); - let mut s = u.mul_add(-PI_A2_F * HALF, d); - s = u.mul_add(-PI_B2_F * HALF, s); - s = u.mul_add(-PI_C2_F * HALF, s); + let mut s = u.mla(-PI_A2_F * HALF, d); + s = u.mla(-PI_B2_F * HALF, s); + s = u.mla(-PI_C2_F * HALF, s); let g = d.abs().simd_lt(TRIGRANGEMAX2_F); if !g.all() { let q2 = (d * FRAC_2_PI).roundi(); let u: F32x = q2.cast(); - let mut t = u.mul_add(-PI_A_F * HALF, d); - t = u.mul_add(-PI_B_F * HALF, t); - t = u.mul_add(-PI_C_F * HALF, t); - t = u.mul_add(-PI_D_F * HALF, t); + let mut t = u.mla(-PI_A_F * HALF, d); + t = u.mla(-PI_B_F * HALF, t); + t = u.mla(-PI_C_F * HALF, t); + t = u.mla(-PI_D_F * HALF, t); q = g.select(q, q2); s = g.select(s, t); @@ -383,19 +383,19 @@ macro_rules! impl_math_f32_u35 { s = s * s; let u = F32x::splat(-0.000_195_169_282_960_705_459_117_889) - .mul_add(s, F32x::splat(0.008_332_157_507_538_795_471_191_41)) - .mul_add(s, F32x::splat(-0.166_666_537_523_269_653_320_312)); + .mla(s, F32x::splat(0.008_332_157_507_538_795_471_191_41)) + .mla(s, F32x::splat(-0.166_666_537_523_269_653_320_312)); - let mut rx = (u * s).mul_add(t, t); + let mut rx = (u * s).mla(t, t); rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F32x::splat(-2.718_118_423_672_422_068_193_55_e-7) - .mul_add(s, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) - .mul_add(s, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) - .mul_add(s, F32x::splat(0.041_666_664_183_139_801_025_390_6)) - .mul_add(s, F32x::splat(-0.5)); + .mla(s, F32x::splat(2.479_904_469_510_074_704_885_48_e-5)) + .mla(s, F32x::splat(-0.001_388_887_874_782_085_418_701_17)) + .mla(s, F32x::splat(0.041_666_664_183_139_801_025_390_6)) + .mla(s, F32x::splat(-0.5)); - let ry = s.mul_add(u, ONE); + let ry = s.mla(u, ONE); let o = (q & I32x::splat(1)).simd_eq(I32x::splat(0)); let mut rsin = o.select(rx, ry); @@ -446,16 +446,16 @@ macro_rules! 
impl_math_f32_u35 { if d.abs().simd_lt(TRIGRANGEMAX2_F * HALF).all() { q = (d * FRAC_2_PI).roundi(); let u: F32x = q.cast(); - x = u.mul_add(-PI_A2_F * HALF, x); - x = u.mul_add(-PI_B2_F * HALF, x); - x = u.mul_add(-PI_C2_F * HALF, x); + x = u.mla(-PI_A2_F * HALF, x); + x = u.mla(-PI_B2_F * HALF, x); + x = u.mla(-PI_C2_F * HALF, x); } else if d.abs().simd_lt(TRIGRANGEMAX_F).all() { q = (d * (F32x::splat(2.) * FRAC_1_PI)).roundi(); let u: F32x = q.cast(); - x = u.mul_add(-PI_A_F * HALF, x); - x = u.mul_add(-PI_B_F * HALF, x); - x = u.mul_add(-PI_C_F * HALF, x); - x = u.mul_add(-PI_D_F * HALF, x); + x = u.mla(-PI_A_F * HALF, x); + x = u.mla(-PI_B_F * HALF, x); + x = u.mla(-PI_C_F * HALF, x); + x = u.mla(-PI_D_F * HALF, x); } else { let (dfidf, dfii) = rempif(d); q = dfii; @@ -471,11 +471,11 @@ macro_rules! impl_math_f32_u35 { let mut u = if cfg!(feature = "enable_neon32") { F32x::splat(0.009_272_458_031_773_567_199_707_03) - .mul_add(s, F32x::splat(0.003_319_849_958_643_317_222_595_21)) - .mul_add(s, F32x::splat(0.024_299_807_846_546_173_095_703_1)) - .mul_add(s, F32x::splat(0.053_449_530_154_466_629_028_320_3)) - .mul_add(s, F32x::splat(0.133_383_005_857_467_651_367_188)) - .mul_add(s, F32x::splat(0.333_331_853_151_321_411_132_812)) + .mla(s, F32x::splat(0.003_319_849_958_643_317_222_595_21)) + .mla(s, F32x::splat(0.024_299_807_846_546_173_095_703_1)) + .mla(s, F32x::splat(0.053_449_530_154_466_629_028_320_3)) + .mla(s, F32x::splat(0.133_383_005_857_467_651_367_188)) + .mla(s, F32x::splat(0.333_331_853_151_321_411_132_812)) } else { let s2 = s * s; let s4 = s2 * s2; @@ -489,7 +489,7 @@ macro_rules! impl_math_f32_u35 { 0.333_331_853_151_321_411_132_812) }; - u = s.mul_add(u * x, x); + u = s.mla(u * x, x); o.select(u.recip(), u) } @@ -503,18 +503,18 @@ macro_rules! 
impl_math_f32_u35 { pub fn tanf_deterministic(d: F32x) -> F32x { let mut q = (d * FRAC_2_PI).roundi(); let u: F32x = q.cast(); - let mut x = u.mul_add(-PI_A2_F * HALF, d); - x = u.mul_add(-PI_B2_F * HALF, x); - x = u.mul_add(-PI_C2_F * HALF, x); + let mut x = u.mla(-PI_A2_F * HALF, d); + x = u.mla(-PI_B2_F * HALF, x); + x = u.mla(-PI_C2_F * HALF, x); let g = d.abs().simd_lt(TRIGRANGEMAX2_F * HALF); if !g.all() { let q2 = (d * FRAC_2_PI).roundi(); let s: F32x = q.cast(); - let mut u = s.mul_add(-PI_A_F * HALF, d); - u = s.mul_add(-PI_B_F * HALF, u); - u = s.mul_add(-PI_C_F * HALF, u); - u = s.mul_add(-PI_D_F * HALF, u); + let mut u = s.mla(-PI_A_F * HALF, d); + u = s.mla(-PI_B_F * HALF, u); + u = s.mla(-PI_C_F * HALF, u); + u = s.mla(-PI_D_F * HALF, u); q = g.select(q, q2); x = g.select(x, u); @@ -537,11 +537,11 @@ macro_rules! impl_math_f32_u35 { let mut u = if cfg!(feature = "enable_neon32") { F32x::splat(0.009_272_458_031_773_567_199_707_03) - .mul_add(s, F32x::splat(0.003_319_849_958_643_317_222_595_21)) - .mul_add(s, F32x::splat(0.024_299_807_846_546_173_095_703_1)) - .mul_add(s, F32x::splat(0.053_449_530_154_466_629_028_320_3)) - .mul_add(s, F32x::splat(0.133_383_005_857_467_651_367_188)) - .mul_add(s, F32x::splat(0.333_331_853_151_321_411_132_812)) + .mla(s, F32x::splat(0.003_319_849_958_643_317_222_595_21)) + .mla(s, F32x::splat(0.024_299_807_846_546_173_095_703_1)) + .mla(s, F32x::splat(0.053_449_530_154_466_629_028_320_3)) + .mla(s, F32x::splat(0.133_383_005_857_467_651_367_188)) + .mla(s, F32x::splat(0.333_331_853_151_321_411_132_812)) } else { let s2 = s * s; let s4 = s2 * s2; @@ -555,7 +555,7 @@ macro_rules! impl_math_f32_u35 { 0.333_331_853_151_321_411_132_812) }; - u = s.mul_add(u * x, x); + u = s.mla(u * x, x); o.select(u.recip(), u) } @@ -595,19 +595,19 @@ macro_rules! 
impl_math_f32_u35 { // let u = F32x::splat(-0.360_092_526_5_e-4) - .mul_add(s, F32x::splat(0.249_008_811_1_e-2)) - .mul_add(s, F32x::splat(-0.807_455_107_6_e-1)) - .mul_add(s, F32x::splat(0.785_398_185_3)); + .mla(s, F32x::splat(0.249_008_811_1_e-2)) + .mla(s, F32x::splat(-0.807_455_107_6_e-1)) + .mla(s, F32x::splat(0.785_398_185_3)); let rx = u * t; // let u = F32x::splat(0.353_981_522_5_e-5) - .mul_add(s, F32x::splat(-0.325_957_400_5_e-3)) - .mul_add(s, F32x::splat(0.158_543_158_3_e-1)) - .mul_add(s, F32x::splat(-0.308_425_128_5)) - .mul_add(s, ONE); + .mla(s, F32x::splat(-0.325_957_400_5_e-3)) + .mla(s, F32x::splat(0.158_543_158_3_e-1)) + .mla(s, F32x::splat(-0.308_425_128_5)) + .mla(s, ONE); let ry = u; @@ -678,8 +678,8 @@ macro_rules! impl_math_f32_u35 { 0.199_926_957_488_059_997_558_594, -0.333_331_018_686_294_555_664_062); - let t = s.mul_add(t * u, s); - q.cast::().mul_add(FRAC_PI_2, t) + let t = s.mla(t * u, s); + q.cast::().mla(FRAC_PI_2, t) } /// Arc tangent function of two variables @@ -729,13 +729,13 @@ macro_rules! impl_math_f32_u35 { let x = o.select(d.abs(), x2.sqrt()); let u = F32x::splat(0.419_745_482_5_e-1) - .mul_add(x2, F32x::splat(0.242_404_602_5_e-1)) - .mul_add(x2, F32x::splat(0.454_742_386_9_e-1)) - .mul_add(x2, F32x::splat(0.749_502_927_1_e-1)) - .mul_add(x2, F32x::splat(0.166_667_729_6)) - .mul_add(x * x2, x); + .mla(x2, F32x::splat(0.242_404_602_5_e-1)) + .mla(x2, F32x::splat(0.454_742_386_9_e-1)) + .mla(x2, F32x::splat(0.749_502_927_1_e-1)) + .mla(x2, F32x::splat(0.166_667_729_6)) + .mla(x * x2, x); - let r = o.select(u, u.mul_add(F32x::splat(-2.), FRAC_PI_2)); + let r = o.select(u, u.mla(F32x::splat(-2.), FRAC_PI_2)); r.mul_sign(d) } @@ -760,10 +760,10 @@ macro_rules! 
impl_math_f32_u35 { x = d.abs().simd_eq(ONE).select(ZERO, x); let u = F32x::splat(0.419_745_482_5_e-1) - .mul_add(x2, F32x::splat(0.242_404_602_5_e-1)) - .mul_add(x2, F32x::splat(0.454_742_386_9_e-1)) - .mul_add(x2, F32x::splat(0.749_502_927_1_e-1)) - .mul_add(x2, F32x::splat(0.166_667_729_6)) + .mla(x2, F32x::splat(0.242_404_602_5_e-1)) + .mla(x2, F32x::splat(0.454_742_386_9_e-1)) + .mla(x2, F32x::splat(0.749_502_927_1_e-1)) + .mla(x2, F32x::splat(0.166_667_729_6)) * (x2 * x); let y = F32x::splat(core::f32::consts::FRAC_PI_2) - (x.mul_sign(d) + u.mul_sign(d)); @@ -812,7 +812,7 @@ macro_rules! impl_math_f32_u35 { 0.199_926_957_488_059_997_558_594, -0.333_331_018_686_294_555_664_062); - t = s.mul_add(t * u, s); + t = s.mla(t * u, s); t = (q & I32x::splat(1)) .simd_eq(I32x::splat(1)) @@ -877,7 +877,7 @@ macro_rules! impl_math_f32_u35 { /// or a correct value with `3.5 ULP` error bound is returned. pub fn coshf(x: F32x) -> F32x { let e = u10::expf(x.abs()); - let mut y = HALF.mul_add(e, HALF / e); + let mut y = HALF.mla(e, HALF / e); y = (x.abs().simd_gt(F32x::splat(88.)) | y.is_nan()).select(INFINITY, y); F32x::from_bits(x.is_nan().to_int().cast() | y.to_bits()) @@ -943,12 +943,12 @@ macro_rules! 
impl_math_f32_u35 { let x2 = x * x; let t = F32x::splat(0.239_282_846_450_805_664_062_5) - .mul_add(x2, F32x::splat(0.285_182_118_415_832_519_531_25)) - .mul_add(x2, F32x::splat(0.400_005_877_017_974_853_515_625)) - .mul_add(x2, F32x::splat(0.666_666_686_534_881_591_796_875)) - .mul_add(x2, F32x::splat(2.)); + .mla(x2, F32x::splat(0.285_182_118_415_832_519_531_25)) + .mla(x2, F32x::splat(0.400_005_877_017_974_853_515_625)) + .mla(x2, F32x::splat(0.666_666_686_534_881_591_796_875)) + .mla(x2, F32x::splat(2.)); - x = x.mul_add(t, F32x::splat(0.693_147_180_559_945_286_226_764) * ef); + x = x.mla(t, F32x::splat(0.693_147_180_559_945_286_226_764) * ef); /*if !cfg!(feature = "enable_avx512f") && !cfg!(feature = "enable_avx512fnofma") {*/ x = d.simd_eq(INFINITY).select(INFINITY, x); x = (d.simd_lt(ZERO) | d.is_nan()).select(NAN, x); @@ -987,18 +987,18 @@ macro_rules! impl_math_f32_u35 { let x2 = x * x; let t = F32x::splat(0.437_408_834_7) - .mul_add(x2, F32x::splat(0.576_484_382_2)) - .mul_add(x2, F32x::splat(0.961_802_423)); + .mla(x2, F32x::splat(0.576_484_382_2)) + .mla(x2, F32x::splat(0.961_802_423)); //if !cfg!(feature = "enable_avx512f") && !cfg!(feature = "enable_avx512fnofma") { - let mut r = (x2 * x).mul_add(t, x.mul_add(F32x::splat(0.288_539_004_3_e+1), e.cast())); + let mut r = (x2 * x).mla(t, x.mla(F32x::splat(0.288_539_004_3_e+1), e.cast())); r = d.simd_eq(INFINITY).select(INFINITY, r); r = (d.simd_lt(ZERO) | d.is_nan()).select(NAN, r); d.simd_eq(ZERO).select(NEG_INFINITY, r) /*} else { - let r = (x2 * x).mul_add(t, x.mul_add(F32x::splat(0.288_539_004_3_e+1), e)); + let r = (x2 * x).mla(t, x.mla(F32x::splat(0.288_539_004_3_e+1), e)); vfixup_vf_vf_vf_vi2_i(r, d, I32::splat((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0) */ @@ -1022,16 +1022,16 @@ macro_rules! 
impl_math_f32_u35 { let mut u = (d * LOG10_2_F).round(); let q = u.roundi(); - let mut s = u.mul_add(-L10U_F, d); - s = u.mul_add(-L10L_F, s); + let mut s = u.mla(-L10U_F, d); + s = u.mla(-L10L_F, s); u = F32x::splat(0.206_400_498_7) - .mul_add(s, F32x::splat(0.541_787_743_6)) - .mul_add(s, F32x::splat(0.117_128_682_1_e+1)) - .mul_add(s, F32x::splat(0.203_465_604_8_e+1)) - .mul_add(s, F32x::splat(0.265_094_876_3_e+1)) - .mul_add(s, F32x::splat(0.230_258_512_5_e+1)) - .mul_add(s, F32x::splat(0.1_e+1)); + .mla(s, F32x::splat(0.541_787_743_6)) + .mla(s, F32x::splat(0.117_128_682_1_e+1)) + .mla(s, F32x::splat(0.203_465_604_8_e+1)) + .mla(s, F32x::splat(0.265_094_876_3_e+1)) + .mla(s, F32x::splat(0.230_258_512_5_e+1)) + .mla(s, F32x::splat(0.1_e+1)); u = ldexp2kf(u, q); @@ -1061,12 +1061,12 @@ macro_rules! impl_math_f32_u35 { let s = d - u; u = F32x::splat(0.153_592_089_2_e-3) - .mul_add(s, F32x::splat(0.133_926_270_1_e-2)) - .mul_add(s, F32x::splat(0.961_838_476_4_e-2)) - .mul_add(s, F32x::splat(0.555_034_726_9_e-1)) - .mul_add(s, F32x::splat(0.240_226_447_6)) - .mul_add(s, F32x::splat(0.693_147_182_5)) - .mul_add(s, F32x::splat(0.1_e+1)); + .mla(s, F32x::splat(0.133_926_270_1_e-2)) + .mla(s, F32x::splat(0.961_838_476_4_e-2)) + .mla(s, F32x::splat(0.555_034_726_9_e-1)) + .mla(s, F32x::splat(0.240_226_447_6)) + .mla(s, F32x::splat(0.693_147_182_5)) + .mla(s, F32x::splat(0.1_e+1)); u = ldexp2kf(u, q); @@ -1167,14 +1167,14 @@ macro_rules! 
impl_math_f32_u35 { d = d.abs(); let x = F32x::splat(-0.601_564_466_953_277_587_890_625) - .mul_add(d, F32x::splat(2.820_889_234_542_846_679_687_5)) - .mul_add(d, F32x::splat(-5.532_182_216_644_287_109_375)) - .mul_add(d, F32x::splat(5.898_262_500_762_939_453_125)) - .mul_add(d, F32x::splat(-3.809_541_702_270_507_812_5)) - .mul_add(d, F32x::splat(2.224_125_623_703_002_929_687_5)); + .mla(d, F32x::splat(2.820_889_234_542_846_679_687_5)) + .mla(d, F32x::splat(-5.532_182_216_644_287_109_375)) + .mla(d, F32x::splat(5.898_262_500_762_939_453_125)) + .mla(d, F32x::splat(-3.809_541_702_270_507_812_5)) + .mla(d, F32x::splat(2.224_125_623_703_002_929_687_5)); let mut y = d * x * x; - y = (y - F32x::splat(2. / 3.) * y * y.mul_add(x, F32x::splat(-1.))) * q; + y = (y - F32x::splat(2. / 3.) * y * y.mla(x, F32x::splat(-1.))) * q; /*if cfg!(feature = "enable_avx512f") || cfg!(feature = "enable_avx512fnofma") { y = s.is_infinite().select(INFINITY.mul_sign(s), y); @@ -1206,7 +1206,7 @@ macro_rules! impl_math_f32_u35 { let max = x.simd_max(y); let t = min / max; - let mut ret = max * t.mul_add(t, ONE).sqrt(); + let mut ret = max * t.mla(t, ONE).sqrt(); ret = min.simd_eq(ZERO).select(max, ret); ret = (x.is_nan() | y.is_nan()).select(NAN, ret); (x.simd_eq(INFINITY) | y.simd_eq(INFINITY)).select(INFINITY, ret) diff --git a/src/f64.rs b/src/f64.rs index 1d9dfba..67267f8 100644 --- a/src/f64.rs +++ b/src/f64.rs @@ -497,8 +497,12 @@ impl BitsType for f64 { impl MulAdd for f64 { #[inline] - fn mul_add(self, y: Self, z: Self) -> Self { - self * y + z + fn mla(self, y: Self, z: Self) -> Self { + if cfg!(target_feature = "fma") { + self.mul_add(y, z) + } else { + self * y + z + } } } @@ -634,15 +638,15 @@ fn rempisub(x: f64) -> (f64, i32) { let rint4x = if fabsk(4.0 * x) > D1_52 { 4.0 * x } else { - (4.0.mul_add(x, c) - c).or_sign(x) + (4.0.mla(x, c) - c).or_sign(x) }; let rintx = if fabsk(x) > D1_52 { x } else { (x + c - c).or_sign(x) }; - let retd = (-0.25).mul_add(rint4x, x); - let reti = 
(-4_f64).mul_add(rintx, rint4x) as i32; + let retd = (-0.25).mla(rint4x, x); + let reti = (-4_f64).mla(rintx, rint4x) as i32; (retd, reti) } @@ -693,7 +697,7 @@ fn sinpik(d: f64) -> Doubled { } else { -2.024_611_207_851_823_992_958_68_e-14 }) - .mul_add( + .mla( s, if o { -3.897_962_260_629_327_991_640_47_e-13 @@ -701,7 +705,7 @@ fn sinpik(d: f64) -> Doubled { 6.948_218_305_801_794_613_277_84_e-12 }, ) - .mul_add( + .mla( s, if o { 1.150_115_825_399_960_352_669_01_e-10 @@ -709,7 +713,7 @@ fn sinpik(d: f64) -> Doubled { -1.757_247_499_528_531_799_526_64_e-9 }, ) - .mul_add( + .mla( s, if o { -2.461_136_950_104_469_749_535_9_e-8 @@ -717,7 +721,7 @@ fn sinpik(d: f64) -> Doubled { 3.133_616_889_668_683_928_784_22_e-7 }, ) - .mul_add( + .mla( s, if o { 3.590_860_448_590_527_540_050_62_e-6 @@ -725,7 +729,7 @@ fn sinpik(d: f64) -> Doubled { -3.657_620_418_216_155_192_036_1_e-5 }, ) - .mul_add( + .mla( s, if o { -0.000_325_991_886_927_389_905_997_954 @@ -778,15 +782,15 @@ fn expk2(d: Doubled) -> Doubled { let s = d + qf * (-L2_U) + qf * (-L2_L); let u = 0.160_247_221_970_993_207_2_e-9_f64 - .mul_add(s.0, 0.209_225_518_356_315_700_7_e-8) - .mul_add(s.0, 0.250_523_002_378_264_446_5_e-7) - .mul_add(s.0, 0.275_572_480_090_213_530_3_e-6) - .mul_add(s.0, 0.275_573_189_238_604_437_3_e-5) - .mul_add(s.0, 0.248_015_873_560_581_506_5_e-4) - .mul_add(s.0, 0.198_412_698_414_807_185_8_e-3) - .mul_add(s.0, 0.138_888_888_888_676_325_5_e-2) - .mul_add(s.0, 0.833_333_333_333_334_709_5_e-2) - .mul_add(s.0, 0.416_666_666_666_666_990_5_e-1); + .mla(s.0, 0.209_225_518_356_315_700_7_e-8) + .mla(s.0, 0.250_523_002_378_264_446_5_e-7) + .mla(s.0, 0.275_572_480_090_213_530_3_e-6) + .mla(s.0, 0.275_573_189_238_604_437_3_e-5) + .mla(s.0, 0.248_015_873_560_581_506_5_e-4) + .mla(s.0, 0.198_412_698_414_807_185_8_e-3) + .mla(s.0, 0.138_888_888_888_676_325_5_e-2) + .mla(s.0, 0.833_333_333_333_334_709_5_e-2) + .mla(s.0, 0.416_666_666_666_666_990_5_e-1); let mut t = s * u + 0.166_666_666_666_666_657_4; t = 
s * t + 0.5; @@ -1104,7 +1108,7 @@ pub fn fmod(x: f64, y: f64) -> f64 { #[inline] fn trunc_positive(x: f64) -> f64 { - let fr = (-D1_31).mul_add((x * (1. / D1_31)) as i32 as f64, x); + let fr = (-D1_31).mla((x * (1. / D1_31)) as i32 as f64, x); if fabsk(x) >= D1_52 { x } else { diff --git a/src/f64/u05.rs b/src/f64/u05.rs index 1bd1d2c..f0b7462 100644 --- a/src/f64/u05.rs +++ b/src/f64/u05.rs @@ -20,11 +20,11 @@ pub fn sincospi(d: f64) -> (f64, f64) { // let u = (-2.024_611_207_851_823_992_958_68_e-14_f64) - .mul_add(s, 6.948_218_305_801_794_613_277_84_e-12) - .mul_add(s, -1.757_247_499_528_531_799_526_64_e-9) - .mul_add(s, 3.133_616_889_668_683_928_784_22_e-7) - .mul_add(s, -3.657_620_418_216_155_192_036_1_e-5) - .mul_add(s, 0.002_490_394_570_192_718_502_743_56); + .mla(s, 6.948_218_305_801_794_613_277_84_e-12) + .mla(s, -1.757_247_499_528_531_799_526_64_e-9) + .mla(s, 3.133_616_889_668_683_928_784_22_e-7) + .mla(s, -3.657_620_418_216_155_192_036_1_e-5) + .mla(s, 0.002_490_394_570_192_718_502_743_56); let mut x = u * s + Doubled::new( -0.080_745_512_188_280_785_248_473_1, @@ -42,11 +42,11 @@ pub fn sincospi(d: f64) -> (f64, f64) { // let u = 9.944_803_876_268_437_740_902_08_e-16_f64 - .mul_add(s, -3.897_962_260_629_327_991_640_47_e-13) - .mul_add(s, 1.150_115_825_399_960_352_669_01_e-10) - .mul_add(s, -2.461_136_950_104_469_749_535_9_e-8) - .mul_add(s, 3.590_860_448_590_527_540_050_62_e-6) - .mul_add(s, -0.000_325_991_886_927_389_905_997_954); + .mla(s, -3.897_962_260_629_327_991_640_47_e-13) + .mla(s, 1.150_115_825_399_960_352_669_01_e-10) + .mla(s, -2.461_136_950_104_469_749_535_9_e-8) + .mla(s, 3.590_860_448_590_527_540_050_62_e-6) + .mla(s, -0.000_325_991_886_927_389_905_997_954); x = u * s + Doubled::new( 0.015_854_344_243_815_501_891_425_9, @@ -155,7 +155,7 @@ fn cospik(d: f64) -> Doubled { } else { -2.024_611_207_851_823_992_958_68_e-14 }) - .mul_add( + .mla( s, if o { -3.897_962_260_629_327_991_640_47_e-13 @@ -163,7 +163,7 @@ fn cospik(d: f64) -> Doubled { 
6.948_218_305_801_794_613_277_84_e-12 }, ) - .mul_add( + .mla( s, if o { 1.150_115_825_399_960_352_669_01_e-10 @@ -171,7 +171,7 @@ fn cospik(d: f64) -> Doubled { -1.757_247_499_528_531_799_526_64_e-9 }, ) - .mul_add( + .mla( s, if o { -2.461_136_950_104_469_749_535_9_e-8 @@ -179,7 +179,7 @@ fn cospik(d: f64) -> Doubled { 3.133_616_889_668_683_928_784_22_e-7 }, ) - .mul_add( + .mla( s, if o { 3.590_860_448_590_527_540_050_62_e-6 @@ -187,7 +187,7 @@ fn cospik(d: f64) -> Doubled { -3.657_620_418_216_155_192_036_1_e-5 }, ) - .mul_add( + .mla( s, if o { -0.000_325_991_886_927_389_905_997_954 diff --git a/src/f64/u10.rs b/src/f64/u10.rs index 877f4ec..878b52a 100644 --- a/src/f64/u10.rs +++ b/src/f64/u10.rs @@ -13,13 +13,13 @@ pub fn sin(d: f64) -> f64 { if fabsk(d) < TRIGRANGEMAX2 { let qlf = rintk(d * FRAC_1_PI); ql = qlf as isize; - s = qlf.mul_add(-PI_A2, d).add_checked_as_doubled(qlf * -PI_B2); + s = qlf.mla(-PI_A2, d).add_checked_as_doubled(qlf * -PI_B2); } else if fabsk(d) < TRIGRANGEMAX { let dqh = trunck(d * (FRAC_1_PI / D1_24)) * D1_24; - let qlf = rintk(d.mul_add(FRAC_1_PI, -dqh)); + let qlf = rintk(d.mla(FRAC_1_PI, -dqh)); ql = qlf as isize; - s = dqh.mul_add(-PI_A, d).add_checked_as_doubled(qlf * -PI_A); + s = dqh.mla(-PI_A, d).add_checked_as_doubled(qlf * -PI_A); s += dqh * -PI_B; s += qlf * -PI_B; s += dqh * -PI_C; @@ -58,7 +58,7 @@ pub fn sin(d: f64) -> f64 { 2.755_731_921_044_282_247_773_79_e-6, -0.000_198_412_698_412_046_454_654_947, ) - .mul_add(s.0, 0.008_333_333_333_333_180_562_019_22); + .mla(s.0, 0.008_333_333_333_333_180_562_019_22); let x = (1.).add_checked((-0.166_666_666_666_666_657_414_808).add_checked_as_doubled(u * s.0) * s); @@ -90,7 +90,7 @@ pub fn cos(d: f64) -> f64 { let d = fabsk(d); if d < TRIGRANGEMAX2 { - ql = (2_f64).mul_add(rintk(d * FRAC_1_PI - 0.5), 1.) as isize; + ql = (2_f64).mla(rintk(d * FRAC_1_PI - 0.5), 1.) 
as isize; let qlf = ql as f64; s = d .add_as_doubled(qlf * (-PI_A2 * 0.5)) @@ -101,7 +101,7 @@ pub fn cos(d: f64) -> f64 { ql = qlf as isize; dqh *= D1_24; - let u = dqh.mul_add(-PI_A * 0.5, d); + let u = dqh.mla(-PI_A * 0.5, d); s = u.add_as_doubled(qlf * (-PI_A * 0.5)); s += dqh * (-PI_B * 0.5); s += qlf * (-PI_B * 0.5); @@ -140,7 +140,7 @@ pub fn cos(d: f64) -> f64 { 2.755_731_921_044_282_247_773_79_e-6, -0.000_198_412_698_412_046_454_654_947, ) - .mul_add(s.0, 0.008_333_333_333_333_180_562_019_22); + .mla(s.0, 0.008_333_333_333_333_180_562_019_22); let x = (1.).add_checked((-0.166_666_666_666_666_657_414_808).add_checked_as_doubled(u * s.0) * s); @@ -174,7 +174,7 @@ pub fn sincos(d: f64) -> (f64, f64) { let qlf = rintk(d * (FRAC_2_PI)); ql = qlf as isize; s = qlf - .mul_add(-PI_A2 * 0.5, d) + .mla(-PI_A2 * 0.5, d) .add_checked_as_doubled(qlf * (-PI_B2 * 0.5)); } else if fabsk(d) < TRIGRANGEMAX { let dqh = trunck(d * ((FRAC_2_PI) / D1_24)) * D1_24; @@ -182,7 +182,7 @@ pub fn sincos(d: f64) -> (f64, f64) { ql = qlf as isize; s = dqh - .mul_add(-PI_A * 0.5, d) + .mla(-PI_A * 0.5, d) .add_checked_as_doubled(qlf * (-PI_A * 0.5)); s += dqh * (-PI_B * 0.5); s += qlf * (-PI_B * 0.5); @@ -202,11 +202,11 @@ pub fn sincos(d: f64) -> (f64, f64) { s.0 = s.square_as_f(); let u = 1.589_383_072_832_289_373_285_11_e-10_f64 - .mul_add(s.0, -2.505_069_435_025_397_733_493_18_e-8) - .mul_add(s.0, 2.755_731_317_768_463_605_125_47_e-6) - .mul_add(s.0, -0.000_198_412_698_278_911_770_864_914) - .mul_add(s.0, 0.008_333_333_333_319_184_596_174_6) - .mul_add(s.0, -0.166_666_666_666_666_130_709_393) + .mla(s.0, -2.505_069_435_025_397_733_493_18_e-8) + .mla(s.0, 2.755_731_317_768_463_605_125_47_e-6) + .mla(s.0, -0.000_198_412_698_278_911_770_864_914) + .mla(s.0, 0.008_333_333_333_319_184_596_174_6) + .mla(s.0, -0.166_666_666_666_666_130_709_393) * s.0 * t.0; @@ -214,12 +214,12 @@ pub fn sincos(d: f64) -> (f64, f64) { let mut rsin = if d.is_neg_zero() { -0. 
} else { f64::from(x) }; let u = (-1.136_153_502_390_974_295_315_23_e-11_f64) - .mul_add(s.0, 2.087_574_712_070_400_554_793_66_e-9) - .mul_add(s.0, -2.755_731_440_288_475_674_985_67_e-7) - .mul_add(s.0, 2.480_158_728_900_018_673_119_15_e-5) - .mul_add(s.0, -0.001_388_888_888_887_140_192_823_29) - .mul_add(s.0, 0.041_666_666_666_666_551_959_206_2) - .mul_add(s.0, -0.5); + .mla(s.0, 2.087_574_712_070_400_554_793_66_e-9) + .mla(s.0, -2.755_731_440_288_475_674_985_67_e-7) + .mla(s.0, 2.480_158_728_900_018_673_119_15_e-5) + .mla(s.0, -0.001_388_888_888_887_140_192_823_29) + .mla(s.0, 0.041_666_666_666_666_551_959_206_2) + .mla(s.0, -0.5); let x = (1.).add_checked(s.0.mul_as_doubled(u)); let mut rcos = f64::from(x); @@ -262,7 +262,7 @@ pub fn tan(d: f64) -> f64 { let qlf = rintk(d * (2. * FRAC_1_PI)); ql = qlf as isize; s = qlf - .mul_add(-PI_A2 * 0.5, d) + .mla(-PI_A2 * 0.5, d) .add_checked_as_doubled(qlf * (-PI_B2 * 0.5)); } else if fabsk(d) < TRIGRANGEMAX { let dqh = trunck(d * (FRAC_2_PI / D1_24)) * D1_24; @@ -272,7 +272,7 @@ pub fn tan(d: f64) -> f64 { let qlf = ql as f64; s = dqh - .mul_add(-PI_A * 0.5, d) + .mla(-PI_A * 0.5, d) .add_checked_as_doubled(qlf * (-PI_A * 0.5)); s += dqh * (-PI_B * 0.5); s += qlf * (-PI_B * 0.5); @@ -306,7 +306,7 @@ pub fn tan(d: f64) -> f64 { 0.539_682_539_951_727_297_e-1, 0.133_333_333_333_050_058_1, ) - .mul_add(s.0, 0.333_333_333_333_334_369_5); + .mla(s.0, 0.333_333_333_333_334_369_5); let mut x = t.add_checked(s * t * u); @@ -378,10 +378,10 @@ fn atan2k_u1(mut y: Doubled, mut x: Doubled) -> Doubled { 0.076_922_533_029_620_376_865_409_5, -0.090_909_044_277_338_757_478_190_7, ) - .mul_add(t.0, 0.111_111_108_376_896_236_538_123) - .mul_add(t.0, -0.142_857_142_756_268_568_062_339) - .mul_add(t.0, 0.199_999_999_997_977_351_284_817) - .mul_add(t.0, -0.333_333_333_333_317_605_173_818); + .mla(t.0, 0.111_111_108_376_896_236_538_123) + .mla(t.0, -0.142_857_142_756_268_568_062_339) + .mla(t.0, 0.199_999_999_997_977_351_284_817) + .mla(t.0, 
-0.333_333_333_333_317_605_173_818); t *= u; t = s * (1.).add_checked(t); @@ -677,7 +677,7 @@ fn logk2(d: Doubled) -> Doubled { 0.285_714_285_511_134_091_777_308, 0.400_000_000_000_914_013_309_483, ) - .mul_add(x2.0, 0.666_666_666_666_664_853_302_393); + .mla(x2.0, 0.666_666_666_666_664_853_302_393); (D_LN2 * (e as f64)) + x.scale(2.) + x2 * x * t } @@ -968,7 +968,7 @@ pub fn log1p(d: f64) -> f64 { let mut e = ilogb2k(dp1 * (1. / 0.75)); let t = ldexp3k(1., -e); - let m = d.mul_add(t, t - 1.); + let m = d.mla(t, t - 1.); if o { e -= 64; @@ -1023,8 +1023,8 @@ pub fn exp(d: f64) -> f64 { let qf = rintk(d * R_LN2); let q = qf as i32; - let s = qf.mul_add(-L2_U, d); - let s = qf.mul_add(-L2_L, s); + let s = qf.mla(-L2_U, d); + let s = qf.mla(-L2_L, s); let s2 = s * s; let s4 = s2 * s2; @@ -1046,7 +1046,7 @@ pub fn exp(d: f64) -> f64 { 0.041_666_666_666_666_504_759_142_2, 0.166_666_666_666_666_851_703_837, ) - .mul_add(s, 0.5); + .mla(s, 0.5); u = s * s * u + s + 1.; @@ -1071,20 +1071,20 @@ fn test_exp() { pub fn exp10(d: f64) -> f64 { let q = rintk(d * LOG10_2) as i32; let qf = q as f64; - let s = qf.mul_add(-L10_U, d); - let s = qf.mul_add(-L10_L, s); + let s = qf.mla(-L10_U, d); + let s = qf.mla(-L10_L, s); let mut u = 0.241_146_349_833_426_765_2_e-3_f64 - .mul_add(s, 0.115_748_841_521_718_737_5_e-2) - .mul_add(s, 0.501_397_554_678_973_365_9_e-2) - .mul_add(s, 0.195_976_232_072_053_308_e-1) - .mul_add(s, 0.680_893_639_944_678_413_8_e-1) - .mul_add(s, 0.206_995_849_472_267_623_4) - .mul_add(s, 0.539_382_929_205_853_622_9) - .mul_add(s, 0.117_125_514_890_854_165_5_e+1) - .mul_add(s, 0.203_467_859_229_343_295_3_e+1) - .mul_add(s, 0.265_094_905_523_920_587_6_e+1) - .mul_add(s, 0.230_258_509_299_404_590_1_e+1); + .mla(s, 0.115_748_841_521_718_737_5_e-2) + .mla(s, 0.501_397_554_678_973_365_9_e-2) + .mla(s, 0.195_976_232_072_053_308_e-1) + .mla(s, 0.680_893_639_944_678_413_8_e-1) + .mla(s, 0.206_995_849_472_267_623_4) + .mla(s, 0.539_382_929_205_853_622_9) + .mla(s, 
0.117_125_514_890_854_165_5_e+1) + .mla(s, 0.203_467_859_229_343_295_3_e+1) + .mla(s, 0.265_094_905_523_920_587_6_e+1) + .mla(s, 0.230_258_509_299_404_590_1_e+1); u = (1.).add_checked(u.mul_as_doubled(s)).normalize().0; if d > 308.254_715_559_916_71 { @@ -1152,7 +1152,7 @@ pub fn exp2(d: f64) -> f64 { 0.555_041_086_648_204_659_6_e-1, 0.240_226_506_959_101_221_4, ) - .mul_add(s, 0.693_147_180_559_945_286_2); + .mla(s, 0.693_147_180_559_945_286_2); u = (1.).add_checked(u.mul_as_doubled(s)).normalize().0; @@ -1358,11 +1358,11 @@ pub fn cbrt(d: f64) -> f64 { let d = fabsk(d); let mut x = (-0.640_245_898_480_692_909_870_982_f64) - .mul_add(d, 2.961_551_030_200_395_118_185_95) - .mul_add(d, -5.733_530_609_229_478_436_361_66) - .mul_add(d, 6.039_903_689_894_587_479_614_07) - .mul_add(d, -3.858_419_355_104_449_888_216_32) - .mul_add(d, 2.230_727_530_249_660_972_572_2); + .mla(d, 2.961_551_030_200_395_118_185_95) + .mla(d, -5.733_530_609_229_478_436_361_66) + .mla(d, 6.039_903_689_894_587_479_614_07) + .mla(d, -3.858_419_355_104_449_888_216_32) + .mla(d, 2.230_727_530_249_660_972_572_2); let mut y = x * x; y = y * y; @@ -1430,7 +1430,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { } else { 0.707_481_600_086_460_927_9_e-7 }) - .mul_add( + .mla( t, if o2 { 1.120_804_464_289_911_606_838_558_160_000 @@ -1440,7 +1440,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.400_924_433_300_873_044_3_e-6 }, ) - .mul_add( + .mla( t, if o2 { 13.397_985_455_142_589_218_333_060_200_00 @@ -1450,7 +1450,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.104_011_464_162_824_694_6_e-5 }, ) - .mul_add( + .mla( t, if o2 { -0.116_546_276_599_463_200_848_033_357_000 @@ -1460,7 +1460,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.150_834_915_073_332_916_7_e-5 }, ) - .mul_add( + .mla( t, if o2 { -1.391_801_093_265_337_481_495_562_410_000 @@ -1470,7 +1470,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.128_814_307_493_390_102_e-5 }, ) - .mul_add( + .mla( t, if o2 { 
0.015_056_113_040_026_424_412_918_973_400 @@ -1480,7 +1480,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.474_416_774_988_499_393_7_e-6 }, ) - .mul_add( + .mla( t, if o2 { 0.179_540_117_061_234_856_098_844_714_000 @@ -1490,7 +1490,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.655_481_630_654_248_990_2_e-7 }, ) - .mul_add( + .mla( t, if o2 { -0.002_481_743_600_264_997_730_942_489_280 @@ -1500,7 +1500,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.318_925_247_145_259_984_4_e-6 }, ) - .mul_add( + .mla( t, if o2 { -0.029_527_880_945_699_120_504_851_034_100 @@ -1510,7 +1510,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.135_888_382_147_035_537_7_e-6 }, ) - .mul_add( + .mla( t, if o2 { 0.000_540_164_767_892_604_515_196_325_186 @@ -1520,7 +1520,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.434_393_127_715_733_604_e-6 }, ) - .mul_add( + .mla( t, if o2 { 0.006_403_362_833_808_069_794_787_256_200 @@ -1530,7 +1530,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.972_478_589_740_677_955_5_e-6 }, ) - .mul_add( + .mla( t, if o2 { -0.000_162_516_262_783_915_816_896_611_252 @@ -1540,7 +1540,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.203_688_605_722_596_601_1_e-5 }, ) - .mul_add( + .mla( t, if o2 { -0.001_914_438_498_565_477_526_465_972_390 @@ -1550,7 +1550,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.437_336_314_181_972_581_5_e-5 }, ) - .mul_add( + .mla( t, if o2 { 7.204_895_416_020_010_558_983_115_17_e-5 @@ -1560,7 +1560,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.943_995_126_830_400_867_7_e-5 }, ) - .mul_add( + .mla( t, if o2 { 0.000_839_498_720_672_087_279_971_000_786 @@ -1570,7 +1570,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.205_072_703_037_638_980_4_e-4 }, ) - .mul_add( + .mla( t, if o2 { -5.171_790_908_260_592_193_293_944_22_e-5 @@ -1580,7 +1580,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.449_262_018_343_118_401_8_e-4 }, ) - .mul_add( + .mla( t, if o2 { -0.000_592_166_437_353_693_882_857_342_347 @@ -1590,7 +1590,7 @@ fn gammak(a: 
f64) -> (Doubled, Doubled) { 0.994_575_123_607_187_593_1_e-4 }, ) - .mul_add( + .mla( t, if o2 { 6.972_813_758_365_857_774_037_435_39_e-5 @@ -1600,7 +1600,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.223_154_759_903_498_319_6_e-3 }, ) - .mul_add( + .mla( t, if o2 { 0.000_784_039_221_720_066_627_493_314_301 @@ -1610,7 +1610,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.509_669_524_710_196_762_2_e-3 }, ) - .mul_add( + .mla( t, if o2 { -0.000_229_472_093_621_399_176_949_318_732 @@ -1620,7 +1620,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.119_275_391_166_788_697_1_e-2 }, ) - .mul_add( + .mla( t, if o2 { -0.002_681_327_160_493_827_160_473_958_490 @@ -1630,7 +1630,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { 0.289_051_033_074_221_031_e-2 }, ) - .mul_add( + .mla( t, if o2 { 0.003_472_222_222_222_222_222_175_164_840 @@ -1640,7 +1640,7 @@ fn gammak(a: f64) -> (Doubled, Doubled) { -0.738_555_102_867_446_185_8_e-2 }, ) - .mul_add( + .mla( t, if o2 { 0.083_333_333_333_333_333_335_592_087_900 diff --git a/src/f64/u15.rs b/src/f64/u15.rs index 60cfb66..e15862c 100644 --- a/src/f64/u15.rs +++ b/src/f64/u15.rs @@ -30,7 +30,7 @@ pub fn erfc(a: f64) -> f64 { } else { 0.233_424_972_963_870_131_9_e+5 }) - .mul_add( + .mla( u.0, if o0 { -0.216_176_624_757_005_566_9_e-18 @@ -42,7 +42,7 @@ pub fn erfc(a: f64) -> f64 { -0.469_566_104_493_310_776_9_e+5 }, ) - .mul_add( + .mla( u.0, if o0 { 0.469_591_917_330_159_567_e-17 @@ -54,7 +54,7 @@ pub fn erfc(a: f64) -> f64 { 0.317_340_310_874_864_335_3_e+5 }, ) - .mul_add( + .mla( u.0, if o0 { -0.904_914_041_988_800_712_2_e-16 @@ -66,7 +66,7 @@ pub fn erfc(a: f64) -> f64 { 0.324_298_278_695_957_378_7_e+4 }, ) - .mul_add( + .mla( u.0, if o0 { 0.163_401_890_355_741_072_8_e-14 @@ -78,7 +78,7 @@ pub fn erfc(a: f64) -> f64 { -0.201_471_799_976_034_781_1_e+5 }, ) - .mul_add( + .mla( u.0, if o0 { -0.278_348_578_633_345_174_5_e-13 @@ -90,7 +90,7 @@ pub fn erfc(a: f64) -> f64 { 0.155_400_697_096_711_828_6_e+5 }, ) - .mul_add( + .mla( u.0, if 
o0 { 0.446_322_127_678_641_575_2_e-12 @@ -102,7 +102,7 @@ pub fn erfc(a: f64) -> f64 { -0.615_087_419_056_355_429_3_e+4 }, ) - .mul_add( + .mla( u.0, if o0 { -0.671_136_662_285_013_656_3_e-11 @@ -114,7 +114,7 @@ pub fn erfc(a: f64) -> f64 { 0.124_004_776_563_481_573_2_e+4 }, ) - .mul_add( + .mla( u.0, if o0 { 0.942_275_905_023_266_222_3_e-10 @@ -126,7 +126,7 @@ pub fn erfc(a: f64) -> f64 { -0.821_032_547_575_269_973_1_e+2 }, ) - .mul_add( + .mla( u.0, if o0 { -0.122_905_553_010_022_909_8_e-8 @@ -138,7 +138,7 @@ pub fn erfc(a: f64) -> f64 { 0.324_244_388_083_993_087_e+2 }, ) - .mul_add( + .mla( u.0, if o0 { 0.148_071_928_158_508_651_2_e-7 @@ -150,7 +150,7 @@ pub fn erfc(a: f64) -> f64 { -0.292_341_886_383_316_058_6_e+2 }, ) - .mul_add( + .mla( u.0, if o0 { -0.163_658_446_912_339_980_3_e-6 @@ -162,7 +162,7 @@ pub fn erfc(a: f64) -> f64 { 0.345_746_173_281_438_307_1 }, ) - .mul_add( + .mla( u.0, if o0 { 0.164_621_143_658_892_357_5_e-5 @@ -174,7 +174,7 @@ pub fn erfc(a: f64) -> f64 { 0.548_973_015_595_239_299_8_e+1 }, ) - .mul_add( + .mla( u.0, if o0 { -0.149_256_503_584_062_351_1_e-4 @@ -186,7 +186,7 @@ pub fn erfc(a: f64) -> f64 { 0.155_993_413_225_129_413_4_e-2 }, ) - .mul_add( + .mla( u.0, if o0 { 0.120_553_329_817_896_785_1_e-3 @@ -198,7 +198,7 @@ pub fn erfc(a: f64) -> f64 { -0.154_174_156_683_152_063_8_e+1 }, ) - .mul_add( + .mla( u.0, if o0 { -0.854_832_702_345_085_008_1_e-3 @@ -210,7 +210,7 @@ pub fn erfc(a: f64) -> f64 { 0.282_315_223_055_836_418_6_e-5 }, ) - .mul_add( + .mla( u.0, if o0 { 0.522_397_762_544_218_793_2_e-2 @@ -222,7 +222,7 @@ pub fn erfc(a: f64) -> f64 { 0.624_999_918_419_534_283_8 }, ) - .mul_add( + .mla( u.0, if o0 { -0.268_661_706_451_312_522_2_e-1 diff --git a/src/f64/u35.rs b/src/f64/u35.rs index 4f8f88a..e6a230a 100644 --- a/src/f64/u35.rs +++ b/src/f64/u35.rs @@ -12,20 +12,20 @@ pub fn sin(mut d: f64) -> f64 { if fabsk(d) < TRIGRANGEMAX2 { let qlf = rintk(d * FRAC_1_PI); ql = qlf as isize; - d = qlf.mul_add(-PI_A2, d); - d = 
qlf.mul_add(-PI_B2, d); + d = qlf.mla(-PI_A2, d); + d = qlf.mla(-PI_B2, d); } else if fabsk(d) < TRIGRANGEMAX { let dqh = trunck(d * (FRAC_1_PI / D1_24)) * D1_24; - let qlf = rintk(d.mul_add(FRAC_1_PI, -dqh)); + let qlf = rintk(d.mla(FRAC_1_PI, -dqh)); ql = qlf as isize; - d = dqh.mul_add(-PI_A, d); - d = qlf.mul_add(-PI_A, d); - d = dqh.mul_add(-PI_B, d); - d = qlf.mul_add(-PI_B, d); - d = dqh.mul_add(-PI_C, d); - d = qlf.mul_add(-PI_C, d); - d = (dqh + qlf).mul_add(-PI_D, d); + d = dqh.mla(-PI_A, d); + d = qlf.mla(-PI_A, d); + d = dqh.mla(-PI_B, d); + d = qlf.mla(-PI_B, d); + d = dqh.mla(-PI_C, d); + d = qlf.mla(-PI_C, d); + d = (dqh + qlf).mla(-PI_D, d); } else { let (mut ddidd, ddii) = rempi(t); ql = (((ddii & 3) * 2 + ((ddidd.0 > 0.) as i32) + 1) >> 2) as isize; @@ -64,12 +64,12 @@ pub fn sin(mut d: f64) -> f64 { -0.000_198_412_698_412_696_162_806_809, 0.008_333_333_333_333_329_748_238_15, ) - .mul_add(s, -0.166_666_666_666_666_657_414_808); + .mla(s, -0.166_666_666_666_666_657_414_808); if t.is_neg_zero() { t } else { - s.mul_add(u * d, d) + s.mla(u * d, d) } } @@ -87,23 +87,23 @@ pub fn cos(mut d: f64) -> f64 { let ql: isize; if fabsk(d) < TRIGRANGEMAX2 { - let qlf = (2_f64).mul_add(rintk(d * FRAC_1_PI - 0.5), 1.); + let qlf = (2_f64).mla(rintk(d * FRAC_1_PI - 0.5), 1.); ql = qlf as isize; - d = qlf.mul_add(-PI_A2 * 0.5, d); - d = qlf.mul_add(-PI_B2 * 0.5, d); + d = qlf.mla(-PI_A2 * 0.5, d); + d = qlf.mla(-PI_B2 * 0.5, d); } else if fabsk(d) < TRIGRANGEMAX { let mut dqh = trunck(d * (FRAC_1_PI / D1_23) - 0.5 * (FRAC_1_PI / D1_23)); let qlf = 2. 
* rintk(d * FRAC_1_PI - 0.5 - dqh * D1_23) + 1.; ql = qlf as isize; dqh *= D1_24; - d = dqh.mul_add(-PI_A * 0.5, d); - d = qlf.mul_add(-PI_A * 0.5, d); - d = dqh.mul_add(-PI_B * 0.5, d); - d = qlf.mul_add(-PI_B * 0.5, d); - d = dqh.mul_add(-PI_C * 0.5, d); - d = qlf.mul_add(-PI_C * 0.5, d); - d = (dqh + qlf).mul_add(-PI_D * 0.5, d); + d = dqh.mla(-PI_A * 0.5, d); + d = qlf.mla(-PI_A * 0.5, d); + d = dqh.mla(-PI_B * 0.5, d); + d = qlf.mla(-PI_B * 0.5, d); + d = dqh.mla(-PI_C * 0.5, d); + d = qlf.mla(-PI_C * 0.5, d); + d = (dqh + qlf).mla(-PI_D * 0.5, d); } else { let (mut ddidd, ddii) = rempi(t); ql = (((ddii & 3) * 2 + ((ddidd.0 > 0.) as i32) + 7) >> 1) as isize; @@ -142,9 +142,9 @@ pub fn cos(mut d: f64) -> f64 { -0.000_198_412_698_412_696_162_806_809, 0.008_333_333_333_333_329_748_238_15, ) - .mul_add(s, -0.166_666_666_666_666_657_414_808); + .mla(s, -0.166_666_666_666_666_657_414_808); - s.mul_add(u * d, d) + s.mla(u * d, d) } #[test] @@ -166,20 +166,20 @@ pub fn sincos(d: f64) -> (f64, f64) { if fabsk(d) < TRIGRANGEMAX2 { let qlf = rintk(s * FRAC_2_PI); ql = qlf as isize; - s = qlf.mul_add(-PI_A2 * 0.5, s); - s = qlf.mul_add(-PI_B2 * 0.5, s); + s = qlf.mla(-PI_A2 * 0.5, s); + s = qlf.mla(-PI_B2 * 0.5, s); } else if fabsk(d) < TRIGRANGEMAX { let dqh = trunck(d * (FRAC_2_PI / D1_24)) * D1_24; let qlf = rintk(d * FRAC_2_PI - dqh); ql = qlf as isize; - s = dqh.mul_add(-PI_A * 0.5, s); - s = qlf.mul_add(-PI_A * 0.5, s); - s = dqh.mul_add(-PI_B * 0.5, s); - s = qlf.mul_add(-PI_B * 0.5, s); - s = dqh.mul_add(-PI_C * 0.5, s); - s = qlf.mul_add(-PI_C * 0.5, s); - s = (dqh + qlf).mul_add(-PI_D * 0.5, s); + s = dqh.mla(-PI_A * 0.5, s); + s = qlf.mla(-PI_A * 0.5, s); + s = dqh.mla(-PI_B * 0.5, s); + s = qlf.mla(-PI_B * 0.5, s); + s = dqh.mla(-PI_C * 0.5, s); + s = qlf.mla(-PI_C * 0.5, s); + s = (dqh + qlf).mla(-PI_D * 0.5, s); } else { let (ddidd, ddii) = rempi(d); ql = ddii as isize; @@ -194,23 +194,23 @@ pub fn sincos(d: f64) -> (f64, f64) { s = s * s; let u = 
1.589_383_072_832_289_373_285_11_e-10_f64 - .mul_add(s, -2.505_069_435_025_397_733_493_18_e-8) - .mul_add(s, 2.755_731_317_768_463_605_125_47_e-6) - .mul_add(s, -0.000_198_412_698_278_911_770_864_914) - .mul_add(s, 0.008_333_333_333_319_184_596_174_6) - .mul_add(s, -0.166_666_666_666_666_130_709_393) + .mla(s, -2.505_069_435_025_397_733_493_18_e-8) + .mla(s, 2.755_731_317_768_463_605_125_47_e-6) + .mla(s, -0.000_198_412_698_278_911_770_864_914) + .mla(s, 0.008_333_333_333_319_184_596_174_6) + .mla(s, -0.166_666_666_666_666_130_709_393) * s * t; let mut rsin = if d.is_neg_zero() { -0. } else { t + u }; let u = (-1.136_153_502_390_974_295_315_23_e-11_f64) - .mul_add(s, 2.087_574_712_070_400_554_793_66_e-9) - .mul_add(s, -2.755_731_440_288_475_674_985_67_e-7) - .mul_add(s, 2.480_158_728_900_018_673_119_15_e-5) - .mul_add(s, -0.001_388_888_888_887_140_192_823_29) - .mul_add(s, 0.041_666_666_666_666_551_959_206_2) - .mul_add(s, -0.5); + .mla(s, 2.087_574_712_070_400_554_793_66_e-9) + .mla(s, -2.755_731_440_288_475_674_985_67_e-7) + .mla(s, 2.480_158_728_900_018_673_119_15_e-5) + .mla(s, -0.001_388_888_888_887_140_192_823_29) + .mla(s, 0.041_666_666_666_666_551_959_206_2) + .mla(s, -0.5); let mut rcos = u * s + 1.; @@ -250,20 +250,20 @@ pub fn tan(d: f64) -> f64 { if fabsk(d) < TRIGRANGEMAX2 { let qlf = rintk(d * FRAC_2_PI); ql = qlf as isize; - x = qlf.mul_add(-PI_A2 * 0.5, d); - x = qlf.mul_add(-PI_B2 * 0.5, x); + x = qlf.mla(-PI_A2 * 0.5, d); + x = qlf.mla(-PI_B2 * 0.5, x); } else if fabsk(d) < 1e+6 { let dqh = trunck(d * (FRAC_2_PI / D1_24)) * D1_24; let qlf = rintk(d * FRAC_2_PI - dqh); ql = qlf as isize; - x = dqh.mul_add(-PI_A * 0.5, d); - x = qlf.mul_add(-PI_A * 0.5, x); - x = dqh.mul_add(-PI_B * 0.5, x); - x = qlf.mul_add(-PI_B * 0.5, x); - x = dqh.mul_add(-PI_C * 0.5, x); - x = qlf.mul_add(-PI_C * 0.5, x); - x = (dqh + qlf).mul_add(-PI_D * 0.5, x); + x = dqh.mla(-PI_A * 0.5, d); + x = qlf.mla(-PI_A * 0.5, x); + x = dqh.mla(-PI_B * 0.5, x); + x = qlf.mla(-PI_B * 
0.5, x); + x = dqh.mla(-PI_C * 0.5, x); + x = qlf.mla(-PI_C * 0.5, x); + x = (dqh + qlf).mla(-PI_D * 0.5, x); } else { let (ddidd, ddii) = rempi(d); ql = ddii as isize; @@ -292,10 +292,10 @@ pub fn tan(d: f64) -> f64 { 0.539_682_539_951_727_297_e-1, 0.133_333_333_333_050_058_1, ) - .mul_add(s, 0.333_333_333_333_334_369_5); - u = s.mul_add(u * x, x); + .mla(s, 0.333_333_333_333_334_369_5); + u = s.mla(u * x, x); - let mut y = u.mul_add(u, -1.); + let mut y = u.mla(u, -1.); x = -2. * u; if (ql & 1) != 0 { @@ -330,25 +330,25 @@ pub fn sincospi(d: f64) -> (f64, f64) { // let u = 0.688_063_889_476_606_013_6_e-11_f64 - .mul_add(s, -0.175_715_956_454_231_019_9_e-8) - .mul_add(s, 0.313_361_632_725_786_731_1_e-6) - .mul_add(s, -0.365_762_041_638_848_645_2_e-4) - .mul_add(s, 0.249_039_457_018_993_210_3_e-2) - .mul_add(s, -0.807_455_121_882_805_632_e-1) - .mul_add(s, 0.785_398_163_397_448_279); + .mla(s, -0.175_715_956_454_231_019_9_e-8) + .mla(s, 0.313_361_632_725_786_731_1_e-6) + .mla(s, -0.365_762_041_638_848_645_2_e-4) + .mla(s, 0.249_039_457_018_993_210_3_e-2) + .mla(s, -0.807_455_121_882_805_632_e-1) + .mla(s, 0.785_398_163_397_448_279); let mut rsin = u * t; // let u = (-0.386_014_121_368_379_435_2_e-12_f64) - .mul_add(s, 0.115_005_788_802_968_141_5_e-9) - .mul_add(s, -0.246_113_649_300_666_355_3_e-7) - .mul_add(s, 0.359_086_044_662_351_671_3_e-5) - .mul_add(s, -0.325_991_886_926_943_594_2_e-3) - .mul_add(s, 0.158_543_442_438_154_116_9_e-1) - .mul_add(s, -0.308_425_137_534_042_437_3) - .mul_add(s, 1.); + .mla(s, 0.115_005_788_802_968_141_5_e-9) + .mla(s, -0.246_113_649_300_666_355_3_e-7) + .mla(s, 0.359_086_044_662_351_671_3_e-5) + .mla(s, -0.325_991_886_926_943_594_2_e-3) + .mla(s, 0.158_543_442_438_154_116_9_e-1) + .mla(s, -0.308_425_137_534_042_437_3) + .mla(s, 1.); let mut rcos = u; @@ -527,7 +527,7 @@ pub fn asin(d: f64) -> f64 { 0.750_000_000_037_858_161_1_e-1, 0.166_666_666_666_649_754_3, ) - .mul_add(x * x2, x); + .mla(x * x2, x); let r = if o { u } else { 
FRAC_PI_2 - 2. * u }; r.mul_sign(d) @@ -657,8 +657,8 @@ fn test_atan() { fn expm1k(d: f64) -> f64 { let q = rintk(d * R_LN2); - let s = q.mul_add(-L2_U, d); - let s = q.mul_add(-L2_L, s); + let s = q.mla(-L2_U, d); + let s = q.mla(-L2_L, s); let s2 = s * s; let s4 = s2 * s2; @@ -681,7 +681,7 @@ fn expm1k(d: f64) -> f64 { 0.166_666_666_666_666_851_703_837, ); - u = s2.mul_add(0.5, s2 * s * u) + s; + u = s2.mla(0.5, s2 * s * u) + s; let q = q as i32; if q != 0 { @@ -838,15 +838,15 @@ pub fn log2(mut d: f64) -> f64 { let x2 = x * x; let t = 0.221_194_175_045_608_149 - .mul_add(x2, 0.220_076_869_315_227_768_9) - .mul_add(x2, 0.262_370_805_748_851_465_6) - .mul_add(x2, 0.320_597_747_794_449_550_2) - .mul_add(x2, 0.412_198_594_548_532_470_9) - .mul_add(x2, 0.577_078_016_299_705_898_2) - .mul_add(x2, 0.961_796_693_926_080_914_49); + .mla(x2, 0.220_076_869_315_227_768_9) + .mla(x2, 0.262_370_805_748_851_465_6) + .mla(x2, 0.320_597_747_794_449_550_2) + .mla(x2, 0.412_198_594_548_532_470_9) + .mla(x2, 0.577_078_016_299_705_898_2) + .mla(x2, 0.961_796_693_926_080_914_49); let s = (e as f64).add_checked((2.885_390_081_777_926_774).mul_as_doubled(x)); - let r = t.mul_add(x * x2, f64::from(s)); + let r = t.mla(x * x2, f64::from(s)); if d == 0. 
{ f64::NEG_INFINITY @@ -870,21 +870,21 @@ fn test_log2() { pub fn exp10(d: f64) -> f64 { let q = rintk(d * LOG10_2); - let mut s = q.mul_add(-L10_U, d); - s = q.mul_add(-L10_L, s); + let mut s = q.mla(-L10_U, d); + s = q.mla(-L10_L, s); let mut u = 0.241_146_349_833_426_765_2_e-3 - .mul_add(s, 0.115_748_841_521_718_737_5_e-2) - .mul_add(s, 0.501_397_554_678_973_365_9_e-2) - .mul_add(s, 0.195_976_232_072_053_308_e-1) - .mul_add(s, 0.680_893_639_944_678_413_8_e-1) - .mul_add(s, 0.206_995_849_472_267_623_4) - .mul_add(s, 0.539_382_929_205_853_622_9) - .mul_add(s, 0.117_125_514_890_854_165_5_e+1) - .mul_add(s, 0.203_467_859_229_343_295_3_e+1) - .mul_add(s, 0.265_094_905_523_920_587_6_e+1) - .mul_add(s, 0.230_258_509_299_404_590_1_e+1) - .mul_add(s, 0.1_e+1); + .mla(s, 0.115_748_841_521_718_737_5_e-2) + .mla(s, 0.501_397_554_678_973_365_9_e-2) + .mla(s, 0.195_976_232_072_053_308_e-1) + .mla(s, 0.680_893_639_944_678_413_8_e-1) + .mla(s, 0.206_995_849_472_267_623_4) + .mla(s, 0.539_382_929_205_853_622_9) + .mla(s, 0.117_125_514_890_854_165_5_e+1) + .mla(s, 0.203_467_859_229_343_295_3_e+1) + .mla(s, 0.265_094_905_523_920_587_6_e+1) + .mla(s, 0.230_258_509_299_404_590_1_e+1) + .mla(s, 0.1_e+1); u = ldexp2k(u, q as i32); @@ -911,17 +911,17 @@ pub fn exp2(d: f64) -> f64 { let s = d - q; let mut u = 0.443_435_908_292_652_945_4_e-9 - .mul_add(s, 0.707_316_459_808_570_742_5_e-8) - .mul_add(s, 0.101_781_926_092_176_045_1_e-6) - .mul_add(s, 0.132_154_387_251_132_761_5_e-5) - .mul_add(s, 0.152_527_335_351_758_473_e-4) - .mul_add(s, 0.154_035_304_510_114_780_8_e-3) - .mul_add(s, 0.133_335_581_467_049_907_3_e-2) - .mul_add(s, 0.961_812_910_759_760_053_6_e-2) - .mul_add(s, 0.555_041_086_648_204_659_6_e-1) - .mul_add(s, 0.240_226_506_959_101_221_4) - .mul_add(s, 0.693_147_180_559_945_286_2) - .mul_add(s, 0.1_e+1); + .mla(s, 0.707_316_459_808_570_742_5_e-8) + .mla(s, 0.101_781_926_092_176_045_1_e-6) + .mla(s, 0.132_154_387_251_132_761_5_e-5) + .mla(s, 0.152_527_335_351_758_473_e-4) + 
.mla(s, 0.154_035_304_510_114_780_8_e-3) + .mla(s, 0.133_335_581_467_049_907_3_e-2) + .mla(s, 0.961_812_910_759_760_053_6_e-2) + .mla(s, 0.555_041_086_648_204_659_6_e-1) + .mla(s, 0.240_226_506_959_101_221_4) + .mla(s, 0.693_147_180_559_945_286_2) + .mla(s, 0.1_e+1); u = ldexp2k(u, q as i32); @@ -972,11 +972,11 @@ pub fn cbrt(mut d: f64) -> f64 { d = fabsk(d); let mut x = (-0.640_245_898_480_692_909_870_982_f64) - .mul_add(d, 2.961_551_030_200_395_118_185_95) - .mul_add(d, -5.733_530_609_229_478_436_361_66) - .mul_add(d, 6.039_903_689_894_587_479_614_07) - .mul_add(d, -3.858_419_355_104_449_888_216_32) - .mul_add(d, 2.230_727_530_249_660_972_572_2); + .mla(d, 2.961_551_030_200_395_118_185_95) + .mla(d, -5.733_530_609_229_478_436_361_66) + .mla(d, 6.039_903_689_894_587_479_614_07) + .mla(d, -3.858_419_355_104_449_888_216_32) + .mla(d, 2.230_727_530_249_660_972_572_2); let mut y = x * x; y = y * y; diff --git a/src/f64x.rs b/src/f64x.rs index 9e07170..4736016 100644 --- a/src/f64x.rs +++ b/src/f64x.rs @@ -525,10 +525,10 @@ macro_rules! impl_math_f64 { impl MulAdd for F64x { #[inline] - fn mul_add(self, y: Self, z: Self) -> Self { + fn mla(self, y: Self, z: Self) -> Self { if cfg!(target_feature = "fma") { use std::simd::{StdFloat}; - ::mul_add(self, y, z) + self.mul_add(y, z) } else { self * y + z } @@ -540,7 +540,7 @@ macro_rules! impl_math_f64 { fn mul_sub(self, y: Self, z: Self) -> Self { if cfg!(target_feature = "fma") { use std::simd::{StdFloat}; - ::mul_add(self, y, -z) + self.mul_add(y, -z) } else { self * y - z } @@ -552,7 +552,7 @@ macro_rules! impl_math_f64 { fn neg_mul_add(self, y: Self, z: Self) -> Self { if cfg!(target_feature = "fma") { use std::simd::{StdFloat}; - ::mul_add(-self, y, z) + (-self).mul_add(y, z) } else { -self * y + z } @@ -657,7 +657,7 @@ macro_rules! 
impl_math_f64 { self.trunc().simd_eq(self) } else { let mut x = (self * (ONE / D1_31X)).trunc(); - x = (-D1_31X).mul_add(x, self); + x = (-D1_31X).mla(x, self); x.trunc().simd_eq(x) | self.abs().simd_gt(D1_53X) } } @@ -723,7 +723,7 @@ macro_rules! impl_math_f64 { x.trunc().simd_ne(x) } else { let mut x = (self * (ONE / D1_31X)).trunc(); - x = (-D1_31X).mul_add(x, self); + x = (-D1_31X).mla(x, self); (x.trunci() & Ix::splat(1)).simd_eq(Ix::splat(1)).cast::() & self.abs().simd_lt(D1_53X) } @@ -769,12 +769,12 @@ macro_rules! impl_math_f64 { let c = D1_52X.mul_sign(x); let rint4x = (F64x::splat(4.) * x).abs().simd_gt(D1_52X).select( (F64x::splat(4.) * x), - (F64x::splat(4.).mul_add(x, c) - c).or_sign(x) + (F64x::splat(4.).mla(x, c) - c).or_sign(x) ); let rintx = x.abs().simd_gt(D1_52X).select(x, ((x + c) - c).or_sign(x)); - let fr = F64x::splat(-0.25).mul_add(rint4x, x); - let vi = F64x::splat(-4.).mul_add(rintx, rint4x).trunci(); + let fr = F64x::splat(-0.25).mla(rint4x, x); + let vi = F64x::splat(-4.).mla(rintx, rint4x).trunci(); (fr, vi) } } @@ -1141,7 +1141,7 @@ macro_rules! impl_math_f64 { #[cfg(not(feature = "full_fp_rounding"))] #[inline] fn trunc_positive(x: F64x) -> F64x { - let mut fr = (-D1_31X).mul_add((x * (ONE / D1_31X)).trunci().cast(), x); + let mut fr = (-D1_31X).mla((x * (ONE / D1_31X)).trunci().cast(), x); fr -= fr.trunci().cast(); x.abs().simd_ge(D1_52X).select(x, x - fr) } @@ -1256,35 +1256,35 @@ macro_rules! 
impl_math_f64 { 9.944_803_876_268_437_740_902_08_e-16, -2.024_611_207_851_823_992_958_68_e-14, ) - .mul_add( + .mla( s, o.select_splat( -3.897_962_260_629_327_991_640_47_e-13, 6.948_218_305_801_794_613_277_84_e-12, ), ) - .mul_add( + .mla( s, o.select_splat( 1.150_115_825_399_960_352_669_01_e-10, -1.757_247_499_528_531_799_526_64_e-9, ), ) - .mul_add( + .mla( s, o.select_splat( -2.461_136_950_104_469_749_535_9_e-8, 3.133_616_889_668_683_928_784_22_e-7, ), ) - .mul_add( + .mla( s, o.select_splat( 3.590_860_448_590_527_540_050_62_e-6, -3.657_620_418_216_155_192_036_1_e-5, ), ) - .mul_add( + .mla( s, o.select_splat( -0.000_325_991_886_927_389_905_997_954, diff --git a/src/f64x/u05_impl.rs b/src/f64x/u05_impl.rs index aa439db..5fab595 100644 --- a/src/f64x/u05_impl.rs +++ b/src/f64x/u05_impl.rs @@ -21,11 +21,11 @@ macro_rules! impl_math_f64_u05 { // let u = F64x::splat(-2.024_611_207_851_823_992_958_68_e-14) - .mul_add(s, F64x::splat(6.948_218_305_801_794_613_277_84_e-12)) - .mul_add(s, F64x::splat(-1.757_247_499_528_531_799_526_64_e-9)) - .mul_add(s, F64x::splat(3.133_616_889_668_683_928_784_22_e-7)) - .mul_add(s, F64x::splat(-3.657_620_418_216_155_192_036_1_e-5)) - .mul_add(s, F64x::splat(0.002_490_394_570_192_718_502_743_56)); + .mla(s, F64x::splat(6.948_218_305_801_794_613_277_84_e-12)) + .mla(s, F64x::splat(-1.757_247_499_528_531_799_526_64_e-9)) + .mla(s, F64x::splat(3.133_616_889_668_683_928_784_22_e-7)) + .mla(s, F64x::splat(-3.657_620_418_216_155_192_036_1_e-5)) + .mla(s, F64x::splat(0.002_490_394_570_192_718_502_743_56)); let mut x = u * s + Doubled::new( F64x::splat(-0.080_745_512_188_280_785_248_473_1), @@ -45,11 +45,11 @@ macro_rules! 
impl_math_f64_u05 { // let u = F64x::splat(9.944_803_876_268_437_740_902_08_e-16) - .mul_add(s, F64x::splat(-3.897_962_260_629_327_991_640_47_e-13)) - .mul_add(s, F64x::splat(1.150_115_825_399_960_352_669_01_e-10)) - .mul_add(s, F64x::splat(-2.461_136_950_104_469_749_535_9_e-8)) - .mul_add(s, F64x::splat(3.590_860_448_590_527_540_050_62_e-6)) - .mul_add(s, F64x::splat(-0.000_325_991_886_927_389_905_997_954)); + .mla(s, F64x::splat(-3.897_962_260_629_327_991_640_47_e-13)) + .mla(s, F64x::splat(1.150_115_825_399_960_352_669_01_e-10)) + .mla(s, F64x::splat(-2.461_136_950_104_469_749_535_9_e-8)) + .mla(s, F64x::splat(3.590_860_448_590_527_540_050_62_e-6)) + .mla(s, F64x::splat(-0.000_325_991_886_927_389_905_997_954)); let mut x = u * s + Doubled::new( F64x::splat(0.015_854_344_243_815_501_891_425_9), @@ -160,35 +160,35 @@ macro_rules! impl_math_f64_u05 { 9.944_803_876_268_437_740_902_08_e-16, -2.024_611_207_851_823_992_958_68_e-14, ) - .mul_add( + .mla( s, o.select_splat( -3.897_962_260_629_327_991_640_47_e-13, 6.948_218_305_801_794_613_277_84_e-12, ), ) - .mul_add( + .mla( s, o.select_splat( 1.150_115_825_399_960_352_669_01_e-10, -1.757_247_499_528_531_799_526_64_e-9, ), ) - .mul_add( + .mla( s, o.select_splat( -2.461_136_950_104_469_749_535_9_e-8, 3.133_616_889_668_683_928_784_22_e-7, ), ) - .mul_add( + .mla( s, o.select_splat( 3.590_860_448_590_527_540_050_62_e-6, -3.657_620_418_216_155_192_036_1_e-5, ), ) - .mul_add( + .mla( s, o.select_splat( -0.000_325_991_886_927_389_905_997_954, diff --git a/src/f64x/u10_impl.rs b/src/f64x/u10_impl.rs index 56316fc..cb654a6 100644 --- a/src/f64x/u10_impl.rs +++ b/src/f64x/u10_impl.rs @@ -13,7 +13,7 @@ macro_rules! 
impl_math_f64_u10 { if d.abs().simd_lt(TRIGRANGEMAX2).all() { let dql = (d * FRAC_1_PI).round(); ql = dql.roundi(); - let u = dql.mul_add(-PI_A2, d); + let u = dql.mla(-PI_A2, d); s = u.add_checked_as_doubled(dql * (-PI_B2)); } else if d.abs().simd_lt(TRIGRANGEMAX).all() { let dqh = (d * (FRAC_1_PI / D1_24X)).trunc(); @@ -21,7 +21,7 @@ macro_rules! impl_math_f64_u10 { let dql = (d.mul_sub(FRAC_1_PI, dqh)).round(); ql = dql.roundi(); - let u = dqh.mul_add(-PI_A, d); + let u = dqh.mla(-PI_A, d); s = u.add_checked_as_doubled(dql * (-PI_A)); s += dqh * (-PI_B); s += dql * (-PI_B); @@ -59,7 +59,7 @@ macro_rules! impl_math_f64_u10 { -2.505_210_681_484_312_335_936_8_e-8, 2.755_731_921_044_282_247_773_79_e-6, -0.000_198_412_698_412_046_454_654_947) - .mul_add(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); + .mla(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); let x = ONE.add_checked( (F64x::splat(-0.166_666_666_666_666_657_414_808).add_checked_as_doubled(u * s.0)) * s, @@ -88,7 +88,7 @@ macro_rules! impl_math_f64_u10 { let g = d.abs().simd_lt(TRIGRANGEMAX2); let dql = (d * FRAC_1_PI).round(); ql = dql.roundi(); - let u = dql.mul_add(-PI_A2, d); + let u = dql.mla(-PI_A2, d); let mut x = u.add_checked_as_doubled(dql * (-PI_B2)); if !g.all() { @@ -96,7 +96,7 @@ macro_rules! impl_math_f64_u10 { dqh *= D1_24X; let dql = d.mul_sub(FRAC_1_PI, dqh).round(); - let u = dqh.mul_add(-PI_A, d); + let u = dqh.mla(-PI_A, d); s = u.add_checked_as_doubled(dql * (-PI_A)); s += dqh * (-PI_B); s += dql * (-PI_B); @@ -142,7 +142,7 @@ macro_rules! impl_math_f64_u10 { -2.505_210_681_484_312_335_936_8_e-8, 2.755_731_921_044_282_247_773_79_e-6, -0.000_198_412_698_412_046_454_654_947) - .mul_add(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); + .mla(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); x = ONE.add_checked( F64x::splat(-0.166_666_666_666_666_657_414_808).add_checked_as_doubled(u * s.0) * s, @@ -184,21 +184,21 @@ macro_rules! 
impl_math_f64_u10 { let mut ql; if d.abs().simd_lt(TRIGRANGEMAX2).all() { - let dql = d.mul_add(FRAC_1_PI, F64x::splat(-0.5)).round(); - let dql = F64x::splat(2.).mul_add(dql, ONE); + let dql = d.mla(FRAC_1_PI, F64x::splat(-0.5)).round(); + let dql = F64x::splat(2.).mla(dql, ONE); ql = dql.roundi(); s = d.add_as_doubled(dql * (-PI_A2) * HALF); s = s.add_checked(dql * (-PI_B2) * HALF); } else if d.abs().simd_lt(TRIGRANGEMAX).all() { let dqh = d - .mul_add(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X) + .mla(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X) .trunc(); - ql = (d * FRAC_1_PI + dqh.mul_add(-D1_23X, F64x::splat(-0.5))).roundi(); + ql = (d * FRAC_1_PI + dqh.mla(-D1_23X, F64x::splat(-0.5))).roundi(); let dqh = dqh * D1_24X; ql = ql + ql + Ix::splat(1); let dql: F64x = ql.cast(); - let u = dqh.mul_add(-PI_A * HALF, d); + let u = dqh.mla(-PI_A * HALF, d); s = u.add_as_doubled(dql * -PI_A * HALF); s += dqh * (-PI_B) * HALF; s += dql * (-PI_B) * HALF; @@ -237,7 +237,7 @@ macro_rules! impl_math_f64_u10 { -2.505_210_681_484_312_335_936_8_e-8, 2.755_731_921_044_282_247_773_79_e-6, -0.000_198_412_698_412_046_454_654_947) - .mul_add(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); + .mla(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); let x = ONE.add_checked( (F64x::splat(-0.166_666_666_666_666_657_414_808).add_checked_as_doubled(u * s.0)) * s, @@ -260,20 +260,20 @@ macro_rules! 
impl_math_f64_u10 { /// NOTE: This version is slower, but SIMD lanes are independent pub fn cos_deterministic(d: F64x) -> F64x { let g = d.abs().simd_lt(TRIGRANGEMAX2); - let mut dql = d.mul_add(FRAC_1_PI, F64x::splat(-0.5)).round(); - dql = F64x::splat(2.).mul_add(dql, ONE); + let mut dql = d.mla(FRAC_1_PI, F64x::splat(-0.5)).round(); + dql = F64x::splat(2.).mla(dql, ONE); let mut ql = dql.roundi(); let mut x = d.add_as_doubled(dql * (-PI_A2 * HALF)); x = x.add_checked(dql * (-PI_B2 * HALF)); if !g.all() { - let mut dqh = (d.mul_add(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X)).trunc(); - let mut ql2 = (d * FRAC_1_PI + dqh.mul_add(-D1_23X, F64x::splat(-0.5))).roundi(); + let mut dqh = (d.mla(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X)).trunc(); + let mut ql2 = (d * FRAC_1_PI + dqh.mla(-D1_23X, F64x::splat(-0.5))).roundi(); dqh *= D1_24X; ql2 = ql2 + ql2 + Ix::splat(1); let dql: F64x = ql2.cast(); - let u = dqh.mul_add(-PI_A * HALF, d); + let u = dqh.mla(-PI_A * HALF, d); let mut s = u.add_as_doubled(dql * (-PI_A * HALF)); s += dqh * (-PI_B * HALF); s += dql * (-PI_B * HALF); @@ -320,7 +320,7 @@ macro_rules! impl_math_f64_u10 { -2.505_210_681_484_312_335_936_8_e-8, 2.755_731_921_044_282_247_773_79_e-6, -0.000_198_412_698_412_046_454_654_947) - .mul_add(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); + .mla(s.0, F64x::splat(0.008_333_333_333_333_180_562_019_22)); x = ONE.add_checked( F64x::splat(-0.166_666_666_666_666_657_414_808).add_checked_as_doubled(u * s.0) * s, @@ -365,7 +365,7 @@ macro_rules! impl_math_f64_u10 { if d.abs().simd_lt(TRIGRANGEMAX2).all() { let dql = (d * FRAC_2_PI).round(); ql = dql.roundi(); - let u = dql.mul_add(-PI_A2 * HALF, d); + let u = dql.mla(-PI_A2 * HALF, d); s = u.add_checked_as_doubled(dql * (-PI_B2) * HALF); } else if d.abs().simd_lt(TRIGRANGEMAX).all() { let dqh = (d * (FRAC_2_PI / D1_24X)).trunc(); @@ -373,7 +373,7 @@ macro_rules! 
impl_math_f64_u10 { let dql = (d * FRAC_2_PI - dqh).round(); ql = dql.roundi(); - let u = dqh.mul_add(-PI_A * HALF, d); + let u = dqh.mla(-PI_A * HALF, d); s = u.add_checked_as_doubled(dql * (-PI_A) * HALF); s += dqh * (-PI_B) * HALF; s += dql * (-PI_B) * HALF; @@ -396,11 +396,11 @@ macro_rules! impl_math_f64_u10 { s.0 = s.square_as_f(); let u = F64x::splat(1.589_383_072_832_289_373_285_11_e-10) - .mul_add(s.0, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) - .mul_add(s.0, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) - .mul_add(s.0, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) - .mul_add(s.0, F64x::splat(0.008_333_333_333_319_184_596_174_6)) - .mul_add(s.0, F64x::splat(-0.166_666_666_666_666_130_709_393)) + .mla(s.0, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) + .mla(s.0, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) + .mla(s.0, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) + .mla(s.0, F64x::splat(0.008_333_333_333_319_184_596_174_6)) + .mla(s.0, F64x::splat(-0.166_666_666_666_666_130_709_393)) * (s.0 * t.0); let x = t.add_checked(u); @@ -409,12 +409,12 @@ macro_rules! 
impl_math_f64_u10 { let rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F64x::splat(-1.136_153_502_390_974_295_315_23_e-11) - .mul_add(s.0, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) - .mul_add(s.0, F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) - .mul_add(s.0, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) - .mul_add(s.0, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) - .mul_add(s.0, F64x::splat(0.041_666_666_666_666_551_959_206_2)) - .mul_add(s.0, F64x::splat(-0.5)); + .mla(s.0, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) + .mla(s.0, F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) + .mla(s.0, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) + .mla(s.0, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) + .mla(s.0, F64x::splat(0.041_666_666_666_666_551_959_206_2)) + .mla(s.0, F64x::splat(-0.5)); let x = ONE.add_checked(s.0.mul_as_doubled(u)); let ry = F64x::from(x); @@ -448,7 +448,7 @@ macro_rules! impl_math_f64_u10 { pub fn sincos_deterministic(d: F64x) -> (F64x, F64x) { let dql = (d * FRAC_2_PI).round(); let mut ql = dql.roundi(); - let u = dql.mul_add(-PI_A2 * HALF, d); + let u = dql.mla(-PI_A2 * HALF, d); let mut s = u.add_checked_as_doubled(dql * (-PI_B2 * HALF)); let g = d.abs().simd_lt(TRIGRANGEMAX2); @@ -457,7 +457,7 @@ macro_rules! impl_math_f64_u10 { dqh *= D1_24X; let dql = (d * FRAC_2_PI - dqh).round(); - let u = dqh.mul_add(-PI_A * HALF, d); + let u = dqh.mla(-PI_A * HALF, d); let mut x = u.add_checked_as_doubled(dql * (-PI_A * HALF)); x += dqh * (-PI_B * HALF); x += dql * (-PI_B * HALF); @@ -488,11 +488,11 @@ macro_rules! 
impl_math_f64_u10 { s.0 = s.square_as_f(); let u = F64x::splat(1.589_383_072_832_289_373_285_11_e-10) - .mul_add(s.0, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) - .mul_add(s.0, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) - .mul_add(s.0, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) - .mul_add(s.0, F64x::splat(0.008_333_333_333_319_184_596_174_6)) - .mul_add(s.0, F64x::splat(-0.166_666_666_666_666_130_709_393)) + .mla(s.0, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) + .mla(s.0, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) + .mla(s.0, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) + .mla(s.0, F64x::splat(0.008_333_333_333_319_184_596_174_6)) + .mla(s.0, F64x::splat(-0.166_666_666_666_666_130_709_393)) * (s.0 * t.0); let x = t.add_checked(u); @@ -501,12 +501,12 @@ macro_rules! impl_math_f64_u10 { rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F64x::splat(-1.136_153_502_390_974_295_315_23_e-11) - .mul_add(s.0, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) - .mul_add(s.0, F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) - .mul_add(s.0, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) - .mul_add(s.0, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) - .mul_add(s.0, F64x::splat(0.041_666_666_666_666_551_959_206_2)) - .mul_add(s.0, F64x::splat(-0.5)); + .mla(s.0, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) + .mla(s.0, F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) + .mla(s.0, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) + .mla(s.0, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) + .mla(s.0, F64x::splat(0.041_666_666_666_666_551_959_206_2)) + .mla(s.0, F64x::splat(-0.5)); let x = ONE.add_checked(s.0.mul_as_doubled(u)); let ry = F64x::from(x); @@ -561,7 +561,7 @@ macro_rules! 
impl_math_f64_u10 { if d.abs().simd_lt(TRIGRANGEMAX2).all() { let dql = (d * FRAC_2_PI).round(); ql = dql.roundi(); - let u = dql.mul_add(-PI_A2 * HALF, d); + let u = dql.mla(-PI_A2 * HALF, d); s = u.add_checked_as_doubled(dql * (-PI_B2) * HALF); } else if d.abs().simd_lt(TRIGRANGEMAX).all() { let dqh = (d * (FRAC_2_PI / D1_24X)).trunc(); @@ -571,7 +571,7 @@ macro_rules! impl_math_f64_u10 { let dql = F64x::from(s).trunc(); ql = dql.roundi(); - let u = dqh.mul_add(-PI_A * HALF, d); + let u = dqh.mla(-PI_A * HALF, d); s = u.add_checked_as_doubled(dql * (-PI_A) * HALF); s += dqh * (-PI_B) * HALF; s += dql * (-PI_B) * HALF; @@ -603,7 +603,7 @@ macro_rules! impl_math_f64_u10 { 0.218_694_872_818_553_549_8_e-1, 0.539_682_539_951_727_297_e-1, 0.133_333_333_333_050_058_1) - .mul_add(s.0, F64x::splat(0.333_333_333_333_334_369_5)); + .mla(s.0, F64x::splat(0.333_333_333_333_334_369_5)); let mut x = t.add_checked(s * t * u); let y = (-ONE).add_checked(x.square()); @@ -627,7 +627,7 @@ macro_rules! impl_math_f64_u10 { pub fn tan_deterministic(d: F64x) -> F64x { let dql = (d * FRAC_2_PI).round(); let mut ql = dql.roundi(); - let u = dql.mul_add(-PI_A2 * HALF, d); + let u = dql.mla(-PI_A2 * HALF, d); let mut s = u.add_checked_as_doubled(dql * (-PI_B2 * HALF)); let g = d.abs().simd_lt(TRIGRANGEMAX2); @@ -638,7 +638,7 @@ macro_rules! impl_math_f64_u10 { + (d.simd_lt(ZERO).select(F64x::splat(-0.5), HALF) - dqh); let dql = F64x::from(x).trunc(); - let u = dqh.mul_add(-PI_A * HALF, d); + let u = dqh.mla(-PI_A * HALF, d); x = u.add_checked_as_doubled(dql * (-PI_A * HALF)); x += dqh * (-PI_B * HALF); x += dql * (-PI_B * HALF); @@ -678,7 +678,7 @@ macro_rules! 
impl_math_f64_u10 { 0.218_694_872_818_553_549_8_e-1, 0.539_682_539_951_727_297_e-1, 0.133_333_333_333_050_058_1) - .mul_add(s.0, F64x::splat(0.333_333_333_333_334_369_5)); + .mla(s.0, F64x::splat(0.333_333_333_333_334_369_5)); let mut x = t.add_checked(s * t * u); let y = (-ONE).add_checked(x.square()); @@ -749,10 +749,10 @@ macro_rules! impl_math_f64_u10 { -0.066_662_088_477_879_549_719_418_2, 0.076_922_533_029_620_376_865_409_5, -0.090_909_044_277_338_757_478_190_7) - .mul_add(t.0, F64x::splat(0.111_111_108_376_896_236_538_123)) - .mul_add(t.0, F64x::splat(-0.142_857_142_756_268_568_062_339)) - .mul_add(t.0, F64x::splat(0.199_999_999_997_977_351_284_817)) - .mul_add(t.0, F64x::splat(-0.333_333_333_333_317_605_173_818)); + .mla(t.0, F64x::splat(0.111_111_108_376_896_236_538_123)) + .mla(t.0, F64x::splat(-0.142_857_142_756_268_568_062_339)) + .mla(t.0, F64x::splat(0.199_999_999_997_977_351_284_817)) + .mla(t.0, F64x::splat(-0.333_333_333_333_317_605_173_818)); t = s.add_checked(s * t * u); (Doubled::new( @@ -1038,7 +1038,7 @@ macro_rules! impl_math_f64_u10 { 0.285_714_285_511_134_091_777_308, 0.400_000_000_000_914_013_309_483, ) - .mul_add(x2.0, F64x::splat(0.666_666_666_666_664_853_302_393)); + .mla(x2.0, F64x::splat(0.666_666_666_666_664_853_302_393)); let mut s = Doubled::::splat(crate::f64::D_LN2) * e.cast(); s = s.add_checked(x.scale(F64x::splat(2.))); @@ -1157,12 +1157,12 @@ macro_rules! 
impl_math_f64_u10 { let x2 = x.0 * x.0; let t = F64x::splat(0.153_207_698_850_270_135_3) - .mul_add(x2, F64x::splat(0.152_562_905_100_342_871_6)) - .mul_add(x2, F64x::splat(0.181_860_593_293_778_599_6)) - .mul_add(x2, F64x::splat(0.222_221_451_983_938_000_9)) - .mul_add(x2, F64x::splat(0.285_714_293_279_429_931_7)) - .mul_add(x2, F64x::splat(0.399_999_999_963_525_199)) - .mul_add(x2, F64x::splat(0.666_666_666_666_733_354_1)); + .mla(x2, F64x::splat(0.152_562_905_100_342_871_6)) + .mla(x2, F64x::splat(0.181_860_593_293_778_599_6)) + .mla(x2, F64x::splat(0.222_221_451_983_938_000_9)) + .mla(x2, F64x::splat(0.285_714_293_279_429_931_7)) + .mla(x2, F64x::splat(0.399_999_999_963_525_199)) + .mla(x2, F64x::splat(0.666_666_666_666_733_354_1)); s = s.add_checked(x.scale(F64x::splat(2.))); s = s.add_checked(x2 * x.0 * t); @@ -1353,14 +1353,14 @@ macro_rules! impl_math_f64_u10 { dp1 = o.select(dp1 * (D1_32X * D1_32X), dp1); let mut e = ilogb2k(dp1 * F64x::splat(1. / 0.75)); let t = ldexp3k(ONE, -e); - m = d.mul_add(t, t - ONE); + m = d.mla(t, t - ONE); e = o.cast().select(e - Ix::splat(64), e); Doubled::::splat(crate::f64::D_LN2) * e.cast() }/* else { let e = vgetexp_vd_vd(dp1, F64x::splat(1. / 0.75)); e = e.simd_eq(INFINITY).select(F64x::splat(1024.), e); let t = ldexp3k(ONE, -e.roundi()); - m = d.mul_add(t, t - ONE); + m = d.mla(t, t - ONE); Doubled::::splat(crate::f64::D_LN2) * e }*/; @@ -1407,8 +1407,8 @@ macro_rules! impl_math_f64_u10 { let mut u = (d * R_LN2).round(); let q = u.roundi(); - let s = u.mul_add(-L2_U, d); - let s = u.mul_add(-L2_L, s); + let s = u.mla(-L2_U, d); + let s = u.mla(-L2_L, s); if cfg!(target_feature = "fma") { let s2 = s * s; @@ -1425,9 +1425,9 @@ macro_rules! impl_math_f64_u10 { 0.833_333_333_331_493_821_e-2, 0.416_666_666_666_660_259_8_e-1, 0.166_666_666_666_666_907_2) - .mul_add(s, HALF) - .mul_add(s, ONE) - .mul_add(s, ONE); + .mla(s, HALF) + .mla(s, ONE) + .mla(s, ONE); } else { let s2 = s * s; @@ -1445,9 +1445,9 @@ macro_rules! 
impl_math_f64_u10 { 0.008_333_333_333_316_527_216_649_84, 0.041_666_666_666_666_504_759_142_2, 0.166_666_666_666_666_851_703_837) - .mul_add(s, HALF); + .mla(s, HALF); - u = ONE + (s * s).mul_add(u, s); + u = ONE + (s * s).mla(u, s); } u = ldexp2k(u, q); @@ -1476,23 +1476,23 @@ macro_rules! impl_math_f64_u10 { let mut u = (d * LOG10_2).round(); let q = u.roundi(); - let s = u.mul_add(-L10_U, d); - let s = u.mul_add(-L10_L, s); + let s = u.mla(-L10_U, d); + let s = u.mla(-L10_L, s); u = F64x::splat(0.241_146_349_833_426_765_2_e-3) - .mul_add(s, F64x::splat(0.115_748_841_521_718_737_5_e-2)) - .mul_add(s, F64x::splat(0.501_397_554_678_973_365_9_e-2)) - .mul_add(s, F64x::splat(0.195_976_232_072_053_308_e-1)) - .mul_add(s, F64x::splat(0.680_893_639_944_678_413_8_e-1)) - .mul_add(s, F64x::splat(0.206_995_849_472_267_623_4)) - .mul_add(s, F64x::splat(0.539_382_929_205_853_622_9)) - .mul_add(s, F64x::splat(0.117_125_514_890_854_165_5_e+1)) - .mul_add(s, F64x::splat(0.203_467_859_229_343_295_3_e+1)) - .mul_add(s, F64x::splat(0.265_094_905_523_920_587_6_e+1)) - .mul_add(s, F64x::splat(0.230_258_509_299_404_590_1_e+1)); + .mla(s, F64x::splat(0.115_748_841_521_718_737_5_e-2)) + .mla(s, F64x::splat(0.501_397_554_678_973_365_9_e-2)) + .mla(s, F64x::splat(0.195_976_232_072_053_308_e-1)) + .mla(s, F64x::splat(0.680_893_639_944_678_413_8_e-1)) + .mla(s, F64x::splat(0.206_995_849_472_267_623_4)) + .mla(s, F64x::splat(0.539_382_929_205_853_622_9)) + .mla(s, F64x::splat(0.117_125_514_890_854_165_5_e+1)) + .mla(s, F64x::splat(0.203_467_859_229_343_295_3_e+1)) + .mla(s, F64x::splat(0.265_094_905_523_920_587_6_e+1)) + .mla(s, F64x::splat(0.230_258_509_299_404_590_1_e+1)); if cfg!(target_feature = "fma") { - u = u.mul_add(s, ONE); + u = u.mla(s, ONE); } else { u = ONE.add_checked(u.mul_as_doubled(s)).normalize().0; } @@ -1566,10 +1566,10 @@ macro_rules! 
impl_math_f64_u10 { 0.961_812_910_759_760_053_6_e-2, 0.555_041_086_648_204_659_6_e-1, 0.240_226_506_959_101_221_4) - .mul_add(s, F64x::splat(0.693_147_180_559_945_286_2)); + .mla(s, F64x::splat(0.693_147_180_559_945_286_2)); if cfg!(target_feature = "fma") { - u = u.mul_add(s, ONE); + u = u.mla(s, ONE); } else { u = ONE.add_checked(u.mul_as_doubled(s)).normalize().0; } @@ -1779,11 +1779,11 @@ macro_rules! impl_math_f64_u10 { d = d.abs(); let mut x = F64x::splat(-0.640_245_898_480_692_909_870_982) - .mul_add(d, F64x::splat(2.961_551_030_200_395_118_185_95)) - .mul_add(d, F64x::splat(-5.733_530_609_229_478_436_361_66)) - .mul_add(d, F64x::splat(6.039_903_689_894_587_479_614_07)) - .mul_add(d, F64x::splat(-3.858_419_355_104_449_888_216_32)) - .mul_add(d, F64x::splat(2.230_727_530_249_660_972_572_2)); + .mla(d, F64x::splat(2.961_551_030_200_395_118_185_95)) + .mla(d, F64x::splat(-5.733_530_609_229_478_436_361_66)) + .mla(d, F64x::splat(6.039_903_689_894_587_479_614_07)) + .mla(d, F64x::splat(-3.858_419_355_104_449_888_216_32)) + .mla(d, F64x::splat(2.230_727_530_249_660_972_572_2)); let mut y = x * x; y = y * y; @@ -1858,7 +1858,7 @@ macro_rules! impl_math_f64_u10 { 0.294_791_677_282_761_419_6_e+2, 0.707_481_600_086_460_927_9_e-7, ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1868,7 +1868,7 @@ macro_rules! impl_math_f64_u10 { 0.400_924_433_300_873_044_3_e-6, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1878,7 +1878,7 @@ macro_rules! impl_math_f64_u10 { 0.104_011_464_162_824_694_6_e-5, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1888,7 +1888,7 @@ macro_rules! impl_math_f64_u10 { 0.150_834_915_073_332_916_7_e-5, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1898,7 +1898,7 @@ macro_rules! impl_math_f64_u10 { 0.128_814_307_493_390_102_e-5, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1908,7 +1908,7 @@ macro_rules! impl_math_f64_u10 { 0.474_416_774_988_499_393_7_e-6, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1918,7 +1918,7 @@ macro_rules! 
impl_math_f64_u10 { -0.655_481_630_654_248_990_2_e-7, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1928,7 +1928,7 @@ macro_rules! impl_math_f64_u10 { -0.318_925_247_145_259_984_4_e-6, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1938,7 +1938,7 @@ macro_rules! impl_math_f64_u10 { 0.135_888_382_147_035_537_7_e-6, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1948,7 +1948,7 @@ macro_rules! impl_math_f64_u10 { -0.434_393_127_715_733_604_e-6, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1958,7 +1958,7 @@ macro_rules! impl_math_f64_u10 { 0.972_478_589_740_677_955_5_e-6, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1968,7 +1968,7 @@ macro_rules! impl_math_f64_u10 { -0.203_688_605_722_596_601_1_e-5, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1978,7 +1978,7 @@ macro_rules! impl_math_f64_u10 { 0.437_336_314_181_972_581_5_e-5, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1988,7 +1988,7 @@ macro_rules! impl_math_f64_u10 { -0.943_995_126_830_400_867_7_e-5, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -1998,7 +1998,7 @@ macro_rules! impl_math_f64_u10 { 0.205_072_703_037_638_980_4_e-4, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -2008,7 +2008,7 @@ macro_rules! impl_math_f64_u10 { -0.449_262_018_343_118_401_8_e-4, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -2018,7 +2018,7 @@ macro_rules! impl_math_f64_u10 { 0.994_575_123_607_187_593_1_e-4, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -2028,7 +2028,7 @@ macro_rules! impl_math_f64_u10 { -0.223_154_759_903_498_319_6_e-3, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -2038,7 +2038,7 @@ macro_rules! impl_math_f64_u10 { 0.509_669_524_710_196_762_2_e-3, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -2048,7 +2048,7 @@ macro_rules! impl_math_f64_u10 { -0.119_275_391_166_788_697_1_e-2, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -2058,7 +2058,7 @@ macro_rules! 
impl_math_f64_u10 { 0.289_051_033_074_221_031_e-2, ), ) - .mul_add( + .mla( t, F64x::select3( o2, @@ -2068,7 +2068,7 @@ macro_rules! impl_math_f64_u10 { -0.738_555_102_867_446_185_8_e-2, ), ) - .mul_add( + .mla( t, F64x::select3( o2, diff --git a/src/f64x/u15_impl.rs b/src/f64x/u15_impl.rs index 61a612f..7c15c65 100644 --- a/src/f64x/u15_impl.rs +++ b/src/f64x/u15_impl.rs @@ -29,7 +29,7 @@ macro_rules! impl_math_f64_u15 { -0.575_781_953_642_071_044_9_e+2, 0.233_424_972_963_870_131_9_e+5, ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -41,7 +41,7 @@ macro_rules! impl_math_f64_u15 { -0.469_566_104_493_310_776_9_e+5, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -53,7 +53,7 @@ macro_rules! impl_math_f64_u15 { 0.317_340_310_874_864_335_3_e+5, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -65,7 +65,7 @@ macro_rules! impl_math_f64_u15 { 0.324_298_278_695_957_378_7_e+4, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -77,7 +77,7 @@ macro_rules! impl_math_f64_u15 { -0.201_471_799_976_034_781_1_e+5, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -89,7 +89,7 @@ macro_rules! impl_math_f64_u15 { 0.155_400_697_096_711_828_6_e+5, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -101,7 +101,7 @@ macro_rules! impl_math_f64_u15 { -0.615_087_419_056_355_429_3_e+4, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -113,7 +113,7 @@ macro_rules! impl_math_f64_u15 { 0.124_004_776_563_481_573_2_e+4, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -125,7 +125,7 @@ macro_rules! impl_math_f64_u15 { -0.821_032_547_575_269_973_1_e+2, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -137,7 +137,7 @@ macro_rules! impl_math_f64_u15 { 0.324_244_388_083_993_087_e+2, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -149,7 +149,7 @@ macro_rules! impl_math_f64_u15 { -0.292_341_886_383_316_058_6_e+2, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -161,7 +161,7 @@ macro_rules! 
impl_math_f64_u15 { 0.345_746_173_281_438_307_1, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -173,7 +173,7 @@ macro_rules! impl_math_f64_u15 { 0.548_973_015_595_239_299_8_e+1, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -185,7 +185,7 @@ macro_rules! impl_math_f64_u15 { 0.155_993_413_225_129_413_4_e-2, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -197,7 +197,7 @@ macro_rules! impl_math_f64_u15 { -0.154_174_156_683_152_063_8_e+1, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -209,7 +209,7 @@ macro_rules! impl_math_f64_u15 { 0.282_315_223_055_836_418_6_e-5, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, @@ -221,7 +221,7 @@ macro_rules! impl_math_f64_u15 { 0.624_999_918_419_534_283_8, ), ) - .mul_add( + .mla( u.0, F64x::select4( o0, diff --git a/src/f64x/u35_impl.rs b/src/f64x/u35_impl.rs index 8f8c979..efe0004 100644 --- a/src/f64x/u35_impl.rs +++ b/src/f64x/u35_impl.rs @@ -13,21 +13,21 @@ macro_rules! impl_math_f64_u35 { if d.abs().simd_lt(TRIGRANGEMAX2).all() { let dql = (d * FRAC_1_PI).round(); ql = dql.roundi(); - d = dql.mul_add(-PI_A2, d); - d = dql.mul_add(-PI_B2, d); + d = dql.mla(-PI_A2, d); + d = dql.mla(-PI_B2, d); } else if d.abs().simd_lt(TRIGRANGEMAX).all() { let dqh = (d * (FRAC_1_PI / D1_24X)).trunc(); let dqh = dqh * D1_24X; let dql = d.mul_sub(FRAC_1_PI, dqh).round(); ql = dql.roundi(); - d = dqh.mul_add(-PI_A, d); - d = dql.mul_add(-PI_A, d); - d = dqh.mul_add(-PI_B, d); - d = dql.mul_add(-PI_B, d); - d = dqh.mul_add(-PI_C, d); - d = dql.mul_add(-PI_C, d); - d = (dqh + dql).mul_add(-PI_D, d); + d = dqh.mla(-PI_A, d); + d = dql.mla(-PI_A, d); + d = dqh.mla(-PI_B, d); + d = dql.mla(-PI_B, d); + d = dqh.mla(-PI_C, d); + d = dql.mla(-PI_C, d); + d = (dqh + dql).mla(-PI_D, d); } else { let (mut ddidd, ddii) = rempi(d); ql = ddii & Ix::splat(3); @@ -64,7 +64,7 @@ macro_rules! 
impl_math_f64_u35 { 2.755_731_922_391_987_476_304_16_e-6, -0.000_198_412_698_412_696_162_806_809, 0.008_333_333_333_333_329_748_238_15) - .mul_add(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); + .mla(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); u = s * (u * d) + d; @@ -85,8 +85,8 @@ macro_rules! impl_math_f64_u35 { let dql = (d * FRAC_1_PI).round(); let mut ql = dql.roundi(); - d = dql.mul_add(-PI_A2, d); - d = dql.mul_add(-PI_B2, d); + d = dql.mla(-PI_A2, d); + d = dql.mla(-PI_B2, d); let g = r.abs().simd_lt(TRIGRANGEMAX2); if !g.all() { @@ -94,13 +94,13 @@ macro_rules! impl_math_f64_u35 { dqh *= D1_24X; let dql = r.mul_sub(FRAC_1_PI, dqh).round(); - let mut u = dqh.mul_add(-PI_A, r); - u = dql.mul_add(-PI_A, u); - u = dqh.mul_add(-PI_B, u); - u = dql.mul_add(-PI_B, u); - u = dqh.mul_add(-PI_C, u); - u = dql.mul_add(-PI_C, u); - u = (dqh + dql).mul_add(-PI_D, u); + let mut u = dqh.mla(-PI_A, r); + u = dql.mla(-PI_A, u); + u = dqh.mla(-PI_B, u); + u = dql.mla(-PI_B, u); + u = dqh.mla(-PI_C, u); + u = dql.mla(-PI_C, u); + u = (dqh + dql).mla(-PI_D, u); ql = g.cast().select(ql, dql.roundi()); d = g.select(d, u); @@ -145,7 +145,7 @@ macro_rules! impl_math_f64_u35 { 2.755_731_922_391_987_476_304_16_e-6, -0.000_198_412_698_412_696_162_806_809, 0.008_333_333_333_333_329_748_238_15) - .mul_add(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); + .mla(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); u = s * (u * d) + d; @@ -178,26 +178,26 @@ macro_rules! 
impl_math_f64_u35 { if d.abs().simd_lt(TRIGRANGEMAX2).all() { let dql = - F64x::splat(2.).mul_add(d.mul_add(FRAC_1_PI, F64x::splat(-0.5)).round(), ONE); + F64x::splat(2.).mla(d.mla(FRAC_1_PI, F64x::splat(-0.5)).round(), ONE); ql = dql.roundi(); - d = dql.mul_add(-PI_A2 * HALF, d); - d = dql.mul_add(-PI_B2 * HALF, d); + d = dql.mla(-PI_A2 * HALF, d); + d = dql.mla(-PI_B2 * HALF, d); } else if d.abs().simd_lt(TRIGRANGEMAX).all() { let dqh = d - .mul_add(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X) + .mla(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X) .trunc(); - ql = (d * FRAC_1_PI + dqh.mul_add(-D1_23X, F64x::splat(-0.5))).roundi(); + ql = (d * FRAC_1_PI + dqh.mla(-D1_23X, F64x::splat(-0.5))).roundi(); let dqh = dqh * D1_24X; ql = ql + ql + Ix::splat(1); let dql: F64x = ql.cast(); - d = dqh.mul_add(-PI_A * HALF, d); - d = dql.mul_add(-PI_A * HALF, d); - d = dqh.mul_add(-PI_B * HALF, d); - d = dql.mul_add(-PI_B * HALF, d); - d = dqh.mul_add(-PI_C * HALF, d); - d = dql.mul_add(-PI_C * HALF, d); - d = (dqh + dql).mul_add(-PI_D * HALF, d); + d = dqh.mla(-PI_A * HALF, d); + d = dql.mla(-PI_A * HALF, d); + d = dqh.mla(-PI_B * HALF, d); + d = dql.mla(-PI_B * HALF, d); + d = dqh.mla(-PI_C * HALF, d); + d = dql.mla(-PI_C * HALF, d); + d = (dqh + dql).mla(-PI_D * HALF, d); } else { let (mut ddidd, ddii) = rempi(d); ql = ddii & Ix::splat(3); @@ -235,7 +235,7 @@ macro_rules! impl_math_f64_u35 { 2.755_731_922_391_987_476_304_16_e-6, -0.000_198_412_698_412_696_162_806_809, 0.008_333_333_333_333_329_748_238_15) - .mul_add(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); + .mla(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); s * (u * d) + d } @@ -250,25 +250,25 @@ macro_rules! 
impl_math_f64_u35 { let r = d; let g = d.abs().simd_lt(TRIGRANGEMAX2); - let dql = F64x::splat(2.).mul_add((d.mul_add(FRAC_1_PI, F64x::splat(-0.5))).round(), ONE); + let dql = F64x::splat(2.).mla((d.mla(FRAC_1_PI, F64x::splat(-0.5))).round(), ONE); let mut ql = dql.roundi(); - d = dql.mul_add(-PI_A2 * HALF, d); - d = dql.mul_add(-PI_B2 * HALF, d); + d = dql.mla(-PI_A2 * HALF, d); + d = dql.mla(-PI_B2 * HALF, d); if !g.all() { - let mut dqh = (r.mul_add(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X)).trunc(); - let mut ql2 = (r * FRAC_1_PI + dqh.mul_add(-D1_23X, F64x::splat(-0.5))).roundi(); + let mut dqh = (r.mla(FRAC_1_PI / D1_23X, -FRAC_1_PI / D1_24X)).trunc(); + let mut ql2 = (r * FRAC_1_PI + dqh.mla(-D1_23X, F64x::splat(-0.5))).roundi(); dqh *= D1_24X; ql2 = ql2 + ql2 + Ix::splat(1); let dql: F64x = ql2.cast(); - let mut u = dqh.mul_add(-PI_A * HALF, r); - u = dql.mul_add(-PI_A * HALF, u); - u = dqh.mul_add(-PI_B * HALF, u); - u = dql.mul_add(-PI_B * HALF, u); - u = dqh.mul_add(-PI_C * HALF, u); - u = dql.mul_add(-PI_C * HALF, u); - u = (dqh + dql).mul_add(-PI_D * HALF, u); + let mut u = dqh.mla(-PI_A * HALF, r); + u = dql.mla(-PI_A * HALF, u); + u = dqh.mla(-PI_B * HALF, u); + u = dql.mla(-PI_B * HALF, u); + u = dqh.mla(-PI_C * HALF, u); + u = dql.mla(-PI_C * HALF, u); + u = (dqh + dql).mla(-PI_D * HALF, u); ql = g.cast().select(ql, ql2); d = g.select(d, u); @@ -314,7 +314,7 @@ macro_rules! impl_math_f64_u35 { 2.755_731_922_391_987_476_304_16_e-6, -0.000_198_412_698_412_696_162_806_809, 0.008_333_333_333_333_329_748_238_15) - .mul_add(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); + .mla(s, F64x::splat(-0.166_666_666_666_666_657_414_808)); s * (u * d) + d } @@ -348,21 +348,21 @@ macro_rules! 
impl_math_f64_u35 { if d.abs().simd_lt(TRIGRANGEMAX2).all() { let dql = (d * FRAC_2_PI).round(); ql = dql.roundi(); - s = dql.mul_add(-PI_A2 * HALF, d); - s = dql.mul_add(-PI_B2 * HALF, s); + s = dql.mla(-PI_A2 * HALF, d); + s = dql.mla(-PI_B2 * HALF, s); } else if d.abs().simd_lt(TRIGRANGEMAX).all() { let dqh = (d * (FRAC_2_PI / D1_24X)).trunc(); let dqh = dqh * D1_24X; let dql = (d * FRAC_2_PI - dqh).round(); ql = dql.roundi(); - s = dqh.mul_add(-PI_A * HALF, d); - s = dql.mul_add(-PI_A * HALF, s); - s = dqh.mul_add(-PI_B * HALF, s); - s = dql.mul_add(-PI_B * HALF, s); - s = dqh.mul_add(-PI_C * HALF, s); - s = dql.mul_add(-PI_C * HALF, s); - s = (dqh + dql).mul_add(-PI_D * HALF, s); + s = dqh.mla(-PI_A * HALF, d); + s = dql.mla(-PI_A * HALF, s); + s = dqh.mla(-PI_B * HALF, s); + s = dql.mla(-PI_B * HALF, s); + s = dqh.mla(-PI_C * HALF, s); + s = dql.mla(-PI_C * HALF, s); + s = (dqh + dql).mla(-PI_D * HALF, s); } else { let (ddidd, ddii) = rempi(d); ql = ddii; @@ -375,24 +375,24 @@ macro_rules! 
impl_math_f64_u35 { s = s * s; let u = F64x::splat(1.589_383_072_832_289_373_285_11_e-10) - .mul_add(s, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) - .mul_add(s, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) - .mul_add(s, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) - .mul_add(s, F64x::splat(0.008_333_333_333_319_184_596_174_6)) - .mul_add(s, F64x::splat(-0.166_666_666_666_666_130_709_393)); + .mla(s, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) + .mla(s, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) + .mla(s, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) + .mla(s, F64x::splat(0.008_333_333_333_319_184_596_174_6)) + .mla(s, F64x::splat(-0.166_666_666_666_666_130_709_393)); - let rx = (u * s).mul_add(t, t); + let rx = (u * s).mla(t, t); let rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F64x::splat(-1.136_153_502_390_974_295_315_23_e-11) - .mul_add(s, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) - .mul_add(s, F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) - .mul_add(s, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) - .mul_add(s, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) - .mul_add(s, F64x::splat(0.041_666_666_666_666_551_959_206_2)) - .mul_add(s, F64x::splat(-0.5)); + .mla(s, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) + .mla(s, F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) + .mla(s, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) + .mla(s, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) + .mla(s, F64x::splat(0.041_666_666_666_666_551_959_206_2)) + .mla(s, F64x::splat(-0.5)); - let ry = s.mul_add(u, ONE); + let ry = s.mla(u, ONE); let o = (ql & Ix::splat(1)).simd_eq(Ix::splat(0)).cast(); let mut rsin = o.select(rx, ry); @@ -422,8 +422,8 @@ macro_rules! 
impl_math_f64_u35 { let dql = (s * FRAC_2_PI).round(); let mut ql = dql.roundi(); - s = dql.mul_add(-PI_A2 * HALF, s); - s = dql.mul_add(-PI_B2 * HALF, s); + s = dql.mla(-PI_A2 * HALF, s); + s = dql.mla(-PI_B2 * HALF, s); let g = d.abs().simd_lt(TRIGRANGEMAX2); if !g.all() { @@ -431,13 +431,13 @@ macro_rules! impl_math_f64_u35 { dqh *= D1_24X; let dql = (d * FRAC_2_PI - dqh).round(); - let mut u = dqh.mul_add(-PI_A * HALF, d); - u = dql.mul_add(-PI_A * HALF, u); - u = dqh.mul_add(-PI_B * HALF, u); - u = dql.mul_add(-PI_B * HALF, u); - u = dqh.mul_add(-PI_C * HALF, u); - u = dql.mul_add(-PI_C * HALF, u); - u = (dqh + dql).mul_add(-PI_D * HALF, u); + let mut u = dqh.mla(-PI_A * HALF, d); + u = dql.mla(-PI_A * HALF, u); + u = dqh.mla(-PI_B * HALF, u); + u = dql.mla(-PI_B * HALF, u); + u = dqh.mla(-PI_C * HALF, u); + u = dql.mla(-PI_C * HALF, u); + u = (dqh + dql).mla(-PI_D * HALF, u); ql = g.cast().select(ql, dql.roundi()); s = g.select(s, u); @@ -458,24 +458,24 @@ macro_rules! impl_math_f64_u35 { s = s * s; let u = F64x::splat(1.589_383_072_832_289_373_285_11_e-10) - .mul_add(s, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) - .mul_add(s, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) - .mul_add(s, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) - .mul_add(s, F64x::splat(0.008_333_333_333_319_184_596_174_6)) - .mul_add(s, F64x::splat(-0.166_666_666_666_666_130_709_393)); + .mla(s, F64x::splat(-2.505_069_435_025_397_733_493_18_e-8)) + .mla(s, F64x::splat(2.755_731_317_768_463_605_125_47_e-6)) + .mla(s, F64x::splat(-0.000_198_412_698_278_911_770_864_914)) + .mla(s, F64x::splat(0.008_333_333_333_319_184_596_174_6)) + .mla(s, F64x::splat(-0.166_666_666_666_666_130_709_393)); - let mut rx = (u * s).mul_add(t, t); + let mut rx = (u * s).mla(t, t); rx = d.is_neg_zero().select(NEG_ZERO, rx); let u = F64x::splat(-1.136_153_502_390_974_295_315_23_e-11) - .mul_add(s, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) - .mul_add(s, 
F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) - .mul_add(s, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) - .mul_add(s, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) - .mul_add(s, F64x::splat(0.041_666_666_666_666_551_959_206_2)) - .mul_add(s, F64x::splat(-0.5)); + .mla(s, F64x::splat(2.087_574_712_070_400_554_793_66_e-9)) + .mla(s, F64x::splat(-2.755_731_440_288_475_674_985_67_e-7)) + .mla(s, F64x::splat(2.480_158_728_900_018_673_119_15_e-5)) + .mla(s, F64x::splat(-0.001_388_888_888_887_140_192_823_29)) + .mla(s, F64x::splat(0.041_666_666_666_666_551_959_206_2)) + .mla(s, F64x::splat(-0.5)); - let ry = s.mul_add(u, ONE); + let ry = s.mla(u, ONE); let o = (ql & Ix::splat(1)).simd_eq(Ix::splat(0)).cast(); let mut rsin = o.select(rx, ry); @@ -526,21 +526,21 @@ macro_rules! impl_math_f64_u35 { if d.abs().simd_lt(TRIGRANGEMAX2).all() { let dql = (d * FRAC_2_PI).round(); ql = dql.roundi(); - x = dql.mul_add(-PI_A2 * HALF, d); - x = dql.mul_add(-PI_B2 * HALF, x); + x = dql.mla(-PI_A2 * HALF, d); + x = dql.mla(-PI_B2 * HALF, x); } else if d.abs().simd_lt(F64x::splat(1e+6)).all() { let dqh = (d * (FRAC_2_PI / D1_24X)).trunc(); let dqh = dqh * D1_24X; let dql = (d * FRAC_2_PI - dqh).round(); ql = dql.roundi(); - x = dqh.mul_add(-PI_A * HALF, d); - x = dql.mul_add(-PI_A * HALF, x); - x = dqh.mul_add(-PI_B * HALF, x); - x = dql.mul_add(-PI_B * HALF, x); - x = dqh.mul_add(-PI_C * HALF, x); - x = dql.mul_add(-PI_C * HALF, x); - x = (dqh + dql).mul_add(-PI_D * HALF, x); + x = dqh.mla(-PI_A * HALF, d); + x = dql.mla(-PI_A * HALF, x); + x = dqh.mla(-PI_B * HALF, x); + x = dql.mla(-PI_B * HALF, x); + x = dqh.mla(-PI_C * HALF, x); + x = dql.mla(-PI_C * HALF, x); + x = (dqh + dql).mla(-PI_D * HALF, x); } else { let (ddidd, ddii) = rempi(d); ql = ddii; @@ -564,10 +564,10 @@ macro_rules! 
impl_math_f64_u35 { 0.218_694_872_818_553_549_8_e-1, 0.539_682_539_951_727_297_e-1, 0.133_333_333_333_050_058_1) - .mul_add(s, F64x::splat(0.333_333_333_333_334_369_5)); - u = s.mul_add(u * x, x); + .mla(s, F64x::splat(0.333_333_333_333_334_369_5)); + u = s.mla(u * x, x); - let y = u.mul_add(u, -ONE); + let y = u.mla(u, -ONE); x = u * F64x::splat(-2.); let o = (ql & Ix::splat(1)).simd_eq(Ix::splat(1)).cast(); @@ -585,8 +585,8 @@ macro_rules! impl_math_f64_u35 { pub fn tan_deterministic(d: F64x) -> F64x { let dql = (d * FRAC_2_PI).round(); let mut ql = dql.roundi(); - let mut s = dql.mul_add(-PI_A2 * HALF, d); - s = dql.mul_add(-PI_B2 * HALF, s); + let mut s = dql.mla(-PI_A2 * HALF, d); + s = dql.mla(-PI_B2 * HALF, s); let g = d.abs().simd_lt(TRIGRANGEMAX2); if !g.all() { @@ -594,13 +594,13 @@ macro_rules! impl_math_f64_u35 { dqh *= D1_24X; let dql = (d * FRAC_2_PI - dqh).round(); - let mut u = dqh.mul_add(-PI_A * HALF, d); - u = dql.mul_add(-PI_A * HALF, u); - u = dqh.mul_add(-PI_B * HALF, u); - u = dql.mul_add(-PI_B * HALF, u); - u = dqh.mul_add(-PI_C * HALF, u); - u = dql.mul_add(-PI_C * HALF, u); - u = (dqh + dql).mul_add(-PI_D * HALF, u); + let mut u = dqh.mla(-PI_A * HALF, d); + u = dql.mla(-PI_A * HALF, u); + u = dqh.mla(-PI_B * HALF, u); + u = dql.mla(-PI_B * HALF, u); + u = dqh.mla(-PI_C * HALF, u); + u = dql.mla(-PI_C * HALF, u); + u = (dqh + dql).mla(-PI_D * HALF, u); ql = g.cast().select(ql, dql.roundi()); s = g.select(s, u); @@ -632,10 +632,10 @@ macro_rules! impl_math_f64_u35 { 0.218_694_872_818_553_549_8_e-1, 0.539_682_539_951_727_297_e-1, 0.133_333_333_333_050_058_1) - .mul_add(s, F64x::splat(0.333_333_333_333_334_369_5)); - u = s.mul_add(u * x, x); + .mla(s, F64x::splat(0.333_333_333_333_334_369_5)); + u = s.mla(u * x, x); - let y = u.mul_add(u, -ONE); + let y = u.mla(u, -ONE); let x = u * F64x::splat(-2.); let o = (ql & Ix::splat(1)).simd_eq(Ix::splat(1)).cast(); @@ -678,23 +678,23 @@ macro_rules! 
impl_math_f64_u35 { // let u = F64x::splat(0.688_063_889_476_606_013_6_e-11) - .mul_add(s, F64x::splat(-0.175_715_956_454_231_019_9_e-8)) - .mul_add(s, F64x::splat(0.313_361_632_725_786_731_1_e-6)) - .mul_add(s, F64x::splat(-0.365_762_041_638_848_645_2_e-4)) - .mul_add(s, F64x::splat(0.249_039_457_018_993_210_3_e-2)) - .mul_add(s, F64x::splat(-0.807_455_121_882_805_632_e-1)) - .mul_add(s, F64x::splat(0.785_398_163_397_448_279)); + .mla(s, F64x::splat(-0.175_715_956_454_231_019_9_e-8)) + .mla(s, F64x::splat(0.313_361_632_725_786_731_1_e-6)) + .mla(s, F64x::splat(-0.365_762_041_638_848_645_2_e-4)) + .mla(s, F64x::splat(0.249_039_457_018_993_210_3_e-2)) + .mla(s, F64x::splat(-0.807_455_121_882_805_632_e-1)) + .mla(s, F64x::splat(0.785_398_163_397_448_279)); let rx = u * t; let u = F64x::splat(-0.386_014_121_368_379_435_2_e-12) - .mul_add(s, F64x::splat(0.115_005_788_802_968_141_5_e-9)) - .mul_add(s, F64x::splat(-0.246_113_649_300_666_355_3_e-7)) - .mul_add(s, F64x::splat(0.359_086_044_662_351_671_3_e-5)) - .mul_add(s, F64x::splat(-0.325_991_886_926_943_594_2_e-3)) - .mul_add(s, F64x::splat(0.158_543_442_438_154_116_9_e-1)) - .mul_add(s, F64x::splat(-0.308_425_137_534_042_437_3)) - .mul_add(s, ONE); + .mla(s, F64x::splat(0.115_005_788_802_968_141_5_e-9)) + .mla(s, F64x::splat(-0.246_113_649_300_666_355_3_e-7)) + .mla(s, F64x::splat(0.359_086_044_662_351_671_3_e-5)) + .mla(s, F64x::splat(-0.325_991_886_926_943_594_2_e-3)) + .mla(s, F64x::splat(0.158_543_442_438_154_116_9_e-1)) + .mla(s, F64x::splat(-0.308_425_137_534_042_437_3)) + .mla(s, ONE); let ry = u; @@ -782,8 +782,8 @@ macro_rules! impl_math_f64_u35 { -0.333_333_333_333_311_110_369_124, ); - t = s.mul_add(t * u, s); - q.cast::().mul_add(FRAC_PI_2, t) + t = s.mla(t * u, s); + q.cast::().mla(FRAC_PI_2, t) } /// Arc tangent function of two variables @@ -849,9 +849,9 @@ macro_rules! 
impl_math_f64_u35 { 0.750_000_000_037_858_161_1_e-1, 0.166_666_666_666_649_754_3); - u = u.mul_add(x * x2, x); + u = u.mla(x * x2, x); - let r = o.select(u, u.mul_add(F64x::splat(-2.), FRAC_PI_2)); + let r = o.select(u, u.mla(F64x::splat(-2.), FRAC_PI_2)); r.mul_sign(d) } @@ -958,7 +958,7 @@ macro_rules! impl_math_f64_u35 { 0.199_999_999_996_591_265_594_148, -0.333_333_333_333_311_110_369_124); - t = s.mul_add(t * u, s); + t = s.mla(t * u, s); t = (q & Ix::splat(1)).simd_eq(Ix::splat(1)).cast().select(FRAC_PI_2 - t, t); t = F64x::from_bits( @@ -989,8 +989,8 @@ macro_rules! impl_math_f64_u35 { let mut u = (d * R_LN2).round(); let q = u.roundi(); - let s = u.mul_add(-L2_U, d); - let s = u.mul_add(-L2_L, s); + let s = u.mla(-L2_U, d); + let s = u.mla(-L2_L, s); let s2 = s * s; let s4 = s2 * s2; @@ -1013,7 +1013,7 @@ macro_rules! impl_math_f64_u35 { 0.166_666_666_666_666_851_703_837, ); - u = s2.mul_add(HALF, s2 * s * u) + s; + u = s2.mla(HALF, s2 * s * u) + s; q.simd_eq(Ix::splat(0)).cast().select(u, ldexp2k(u + ONE, q) - ONE) } @@ -1053,7 +1053,7 @@ macro_rules! impl_math_f64_u35 { /// or a correct value with `3.5 ULP` error bound is returned. pub fn cosh(x: F64x) -> F64x { let e = u10::exp(x.abs()); - let mut y = HALF.mul_add(e, HALF / e); + let mut y = HALF.mla(e, HALF / e); y = (x.abs().simd_gt(F64x::splat(709.)) | y.is_nan()).select(INFINITY, y); F64x::from_bits(x.is_nan().to_int().cast() | y.to_bits()) @@ -1132,15 +1132,15 @@ macro_rules! 
impl_math_f64_u35 { 0.666_666_666_666_777_874_006_3); /*if !cfg!(feature = "enable_avx512f") && !cfg!(feature = "enable_avx512fnofma") {*/ - x = x.mul_add(F64x::splat(2.), F64x::splat(0.693_147_180_559_945_286_226_764) * ef); - x = x3.mul_add(t, x); + x = x.mla(F64x::splat(2.), F64x::splat(0.693_147_180_559_945_286_226_764) * ef); + x = x3.mla(t, x); x = d.simd_eq(INFINITY).select(INFINITY, x); x = (d.simd_lt(ZERO) | d.is_nan()).select(NAN, x); d.simd_eq(ZERO).select(NEG_INFINITY, x) /* } else { - x = x.mul_add(F64x::splat(2.), F64x::splat(0.693_147_180_559_945_286_226_764) * ef); - x = x3.mul_add(t, x); + x = x.mla(F64x::splat(2.), F64x::splat(0.693_147_180_559_945_286_226_764) * ef); + x = x3.mla(t, x); vfixup_vd_vd_vd_vi2_i(x, d, I64x::splat((5 << (5 * 4))), 0) }*/ } @@ -1174,12 +1174,12 @@ macro_rules! impl_math_f64_u35 { let x2 = x * x; let t = F64x::splat(0.221_194_175_045_608_149) - .mul_add(x2, F64x::splat(0.220_076_869_315_227_768_9)) - .mul_add(x2, F64x::splat(0.262_370_805_748_851_465_6)) - .mul_add(x2, F64x::splat(0.320_597_747_794_449_550_2)) - .mul_add(x2, F64x::splat(0.412_198_594_548_532_470_9)) - .mul_add(x2, F64x::splat(0.577_078_016_299_705_898_2)) - .mul_add(x2, F64x::splat(0.961_796_693_926_080_914_49)); + .mla(x2, F64x::splat(0.220_076_869_315_227_768_9)) + .mla(x2, F64x::splat(0.262_370_805_748_851_465_6)) + .mla(x2, F64x::splat(0.320_597_747_794_449_550_2)) + .mla(x2, F64x::splat(0.412_198_594_548_532_470_9)) + .mla(x2, F64x::splat(0.577_078_016_299_705_898_2)) + .mla(x2, F64x::splat(0.961_796_693_926_080_914_49)); let s = //if !cfg!(feature = "enable_avx512f") && !cfg!(feature = "enable_avx512fnofma") { @@ -1188,7 +1188,7 @@ macro_rules! 
impl_math_f64_u35 { e.add_checked(x.mul_as_doubled(F64x::splat(2.885_390_081_777_926_774))) */ }; - let mut r = t.mul_add(x * x2, F64x::from(s)); + let mut r = t.mla(x * x2, F64x::from(s)); //if !cfg!(feature = "enable_avx512f") && !cfg!(feature = "enable_avx512fnofma") { r = d.simd_eq(INFINITY).select(INFINITY, r); @@ -1218,8 +1218,8 @@ macro_rules! impl_math_f64_u35 { let mut u = (d * LOG10_2).round(); let q = u.roundi(); - let mut s = u.mul_add(-L10_U, d); - s = u.mul_add(-L10_L, s); + let mut s = u.mla(-L10_U, d); + s = u.mla(-L10_L, s); let s2 = s * s; let s4 = s2 * s2; @@ -1242,7 +1242,7 @@ macro_rules! impl_math_f64_u35 { 0.230_258_509_299_404_590_1_e+1, ); - u = u.mul_add(s, ONE); + u = u.mla(s, ONE); u = ldexp2k(u, q); @@ -1290,9 +1290,9 @@ macro_rules! impl_math_f64_u35 { 0.555_041_086_648_204_659_6_e-1, 0.240_226_506_959_101_221_4, ); - u = u.mul_add(s, F64x::splat(0.693_147_180_559_945_286_2)); + u = u.mla(s, F64x::splat(0.693_147_180_559_945_286_2)); - u = u.mul_add(s, ONE); + u = u.mla(s, ONE); u = ldexp2k(u, q); @@ -1344,17 +1344,17 @@ macro_rules! impl_math_f64_u35 { d = d.abs(); let mut x = F64x::splat(-0.640_245_898_480_692_909_870_982) - .mul_add(d, F64x::splat(2.961_551_030_200_395_118_185_95)) - .mul_add(d, F64x::splat(-5.733_530_609_229_478_436_361_66)) - .mul_add(d, F64x::splat(6.039_903_689_894_587_479_614_07)) - .mul_add(d, F64x::splat(-3.858_419_355_104_449_888_216_32)) - .mul_add(d, F64x::splat(2.230_727_530_249_660_972_572_2)); + .mla(d, F64x::splat(2.961_551_030_200_395_118_185_95)) + .mla(d, F64x::splat(-5.733_530_609_229_478_436_361_66)) + .mla(d, F64x::splat(6.039_903_689_894_587_479_614_07)) + .mla(d, F64x::splat(-3.858_419_355_104_449_888_216_32)) + .mla(d, F64x::splat(2.230_727_530_249_660_972_572_2)); let mut y = x * x; y = y * y; x -= d.mul_sub(y, x) * F64x::splat(1. / 3.); y = d * x * x; - y = (y - F64x::splat(2. / 3.) * y * y.mul_add(x, F64x::splat(-1.))) * q; + y = (y - F64x::splat(2. / 3.) 
* y * y.mla(x, F64x::splat(-1.))) * q; /*if cfg!(feature = "enable_avx512f") || cfg!(feature = "enable_avx512fnofma") { y = s.is_infinite().select(INFINITY.mul_sign(s), y); @@ -1386,7 +1386,7 @@ macro_rules! impl_math_f64_u35 { let max = x.simd_max(y); let t = min / max; - let mut ret = max * t.mul_add(t, ONE).sqrt(); + let mut ret = max * t.mla(t, ONE).sqrt(); ret = min.simd_eq(ZERO).select(max, ret); ret = (x.is_nan() | y.is_nan()).select(NAN, ret); (x.simd_eq(INFINITY) | y.simd_eq(INFINITY)).select(INFINITY, ret)