diff --git a/Project.toml b/Project.toml
index ac39f7f..91919b7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "SLEEFPirates"
 uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
 authors = ["chriselrod "]
-version = "0.5.3"
+version = "0.5.4"
 
 [deps]
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
diff --git a/src/sleef.jl b/src/sleef.jl
index e23a4bb..0d2cfc5 100644
--- a/src/sleef.jl
+++ b/src/sleef.jl
@@ -1,6 +1,6 @@
 @inline function log(v::Vec{8,Float64})
 Base.llvmcall(("""
-declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
 declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32)
 declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
@@ -16,16 +16,16 @@ declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8
 %10 = fmul <8 x double> %9, %9
 %11 = fmul <8 x double> %10, %10
 %12 = fmul <8 x double> %8, %9
- %13 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %9, <8 x double> , <8 x double> ) #13
- %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> , <8 x double> %13) #13
- %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %9, <8 x double> , <8 x double> ) #13
- %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %9, <8 x double> , <8 x double> ) #13
- %17 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> %15, <8 x double> %16) #13
- %18 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %14, <8 x double> %17) #13
+ %13 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %9, <8 x double> , <8 x double> ) #13
+ %14 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> , <8 x double> %13) #13
+ %15 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %9, <8 x double> , <8 x double> ) #13
+ %16 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %9, <8 x double> , <8 x double> ) #13
+ %17 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> %15, <8 x double> %16) #13
+ %18 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %11, <8 x double> %14, <8 x double> %17) #13
 %19 = fmul <8 x double> %3, 
 %20 = select <8 x i1> %4, <8 x double> , <8 x double> %19
- %21 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %8, <8 x double> , <8 x double> %20) #13
- %22 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %12, <8 x double> %18, <8 x double> %21) #13
+ %21 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %8, <8 x double> , <8 x double> %20) #13
+ %22 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %12, <8 x double> %18, <8 x double> %21) #13
 %23 = tail call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %22, <8 x double> %0, <8 x i64> , i32 0, i8 -1, i32 4)
 ret <8 x double> %23
 """), Vec{8,Float64}, Tuple{Vec{8,Float64}}, v)
@@ -35,7 +35,7 @@ end
 # Support different LLVM versions. Only difference is fneg in llvm 8+
 @inline function log2(v::Vec{8,Float64})
 Base.llvmcall(("""
- declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+ declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
 declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32)
 declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
@@ -49,22 +49,22 @@
 %8 = fadd <8 x double> %6, 
 %9 = fdiv <8 x double> %7, %8
 %10 = fmul <8 x double> %9, %9
- %11 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> , <8 x double> ) #13
- %12 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %10, <8 x double> ) #13
- %13 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %12, <8 x double> %10, <8 x double> ) #13
- %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %13, <8 x double> %10, <8 x double> ) #13
- %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %14, <8 x double> %10, <8 x double> ) #13
- %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %15, <8 x double> %10, <8 x double> ) #13
+ %11 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> , <8 x double> ) #13
+ %12 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %11, <8 x double> %10, <8 x double> ) #13
+ %13 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %12, <8 x double> %10, <8 x double> ) #13
+ %14 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %13, <8 x double> %10, <8 x double> ) #13
+ %15 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %14, <8 x double> %10, <8 x double> ) #13
+ %16 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %15, <8 x double> %10, <8 x double> ) #13
 %17 = fmul <8 x double> %9, 
 %18 = fneg <8 x double> %17
- %19 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %9, <8 x double> , <8 x double> %18) #13
+ %19 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %9, <8 x double> , <8 x double> %18) #13
 %20 = fadd <8 x double> %5, %17
 %21 = fsub <8 x double> %5, %20
 %22 = fadd <8 x double> %17, %21
 %23 = fadd <8 x double> %19, %22
 %24 = fmul <8 x double> %9, %10
 %25 = fadd <8 x double> %20, %23
- %26 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %16, <8 x double> %24, <8 x double> %25) #13
+ %26 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %16, <8 x double> %24, <8 x double> %25) #13
 %27 = tail call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %26, <8 x double> %0, <8 x i64> , i32 0, i8 -1, i32 4)
 ret <8 x double> %27
 """), Vec{8,Float64}, Tuple{Vec{8,Float64}}, v)
@@ -72,7 +72,7 @@ end
 else
 @inline function log2(v::Vec{8,Float64})
 Base.llvmcall(("""
- declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+ declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
 declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32)
 declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
@@ -86,22 +86,22 @@
 %8 = fadd <8 x double> %6, 
 %9 = fdiv <8 x double> %7, %8
 %10 = fmul <8 x double> %9, %9
- %11 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> , <8 x double> ) #13
- %12 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %11, <8 x double> %10, <8 x double> ) #13
- %13 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %12, <8 x double> %10, <8 x double> ) #13
- %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %13, <8 x double> %10, <8 x double> ) #13
- %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %14, <8 x double> %10, <8 x double> ) #13
- %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %15, <8 x double> %10, <8 x double> ) #13
+ %11 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> , <8 x double> ) #13
+ %12 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %11, <8 x double> %10, <8 x double> ) #13
+ %13 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %12, <8 x double> %10, <8 x double> ) #13
+ %14 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %13, <8 x double> %10, <8 x double> ) #13
+ %15 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %14, <8 x double> %10, <8 x double> ) #13
+ %16 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %15, <8 x double> %10, <8 x double> ) #13
 %17 = fmul <8 x double> %9, 
 %18 = fsub fast <8 x double> , %17
- %19 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %9, <8 x double> , <8 x double> %18) #13
+ %19 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %9, <8 x double> , <8 x double> %18) #13
 %20 = fadd <8 x double> %5, %17
 %21 = fsub <8 x double> %5, %20
 %22 = fadd <8 x double> %17, %21
 %23 = fadd <8 x double> %19, %22
 %24 = fmul <8 x double> %9, %10
 %25 = fadd <8 x double> %20, %23
- %26 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %16, <8 x double> %24, <8 x double> %25) #13
+ %26 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %16, <8 x double> %24, <8 x double> %25) #13
 %27 = tail call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %26, <8 x double> %0, <8 x i64> , i32 0, i8 -1, i32 4)
 ret <8 x double> %27
 """), Vec{8,Float64}, Tuple{Vec{8,Float64}}, v)
@@ -119,7 +119,7 @@ end
 Base.llvmcall(("""
 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
 declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
-declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 ""","""
 %2 = tail call <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double> %0, <8 x i64> zeroinitializer, i8 -1, i32 8) #13
 %3 = tail call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %0, i32 8, <8 x double> zeroinitializer, i8 -1, i32 4)
@@ -127,17 +127,17 @@ declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
 %5 = fmul <8 x double> %4, %4
 %6 = fmul <8 x double> %5, %5
 %7 = fmul <8 x double> %6, %6
- %8 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
- %9 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
- %10 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
- %11 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %5, <8 x double> %9, <8 x double> %10) #13
- %12 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
- %13 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
- %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %5, <8 x double> %12, <8 x double> %13) #13
- %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> %11, <8 x double> %14) #13
- %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> %8, <8 x double> %15) #13
- %17 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %16, <8 x double> %4, <8 x double> ) #13
- %18 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %17, <8 x double> %4, <8 x double> ) #13
+ %8 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
+ %9 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
+ %10 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
+ %11 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %5, <8 x double> %9, <8 x double> %10) #13
+ %12 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
+ %13 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %4, <8 x double> , <8 x double> ) #13
+ %14 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %5, <8 x double> %12, <8 x double> %13) #13
+ %15 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> %11, <8 x double> %14) #13
+ %16 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %7, <8 x double> %8, <8 x double> %15) #13
+ %17 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %16, <8 x double> %4, <8 x double> ) #13
+ %18 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %17, <8 x double> %4, <8 x double> ) #13
 %19 = ashr <8 x i64> %2, 
 %20 = add nsw <8 x i64> %19, 
 %21 = shl <8 x i64> %20, 
@@ -164,28 +164,28 @@ end
 Base.llvmcall(("""
 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
 declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
-declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 ""","""
 %2 = fmul <8 x double> %0, 
 %3 = tail call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %2, i32 8, <8 x double> zeroinitializer, i8 -1, i32 4) #13
 %4 = tail call <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double> %3, <8 x i64> zeroinitializer, i8 -1, i32 8) #13
- %5 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %3, <8 x double> , <8 x double> %0) #13
- %6 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %3, <8 x double> , <8 x double> %5) #13
+ %5 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %3, <8 x double> , <8 x double> %0) #13
+ %6 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %3, <8 x double> , <8 x double> %5) #13
 %7 = fmul <8 x double> %6, %6
 %8 = fmul <8 x double> %7, %7
 %9 = fmul <8 x double> %8, %8
- %10 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
- %11 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
- %12 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
- %13 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> %11, <8 x double> %12) #13
- %14 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
- %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
- %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %7, <8 x double> %14, <8 x double> %15) #13
- %17 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %8, <8 x double> %13, <8 x double> %16) #13
- %18 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %9, <8 x double> %10, <8 x double> %17) #13
- %19 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %18, <8 x double> %6, <8 x double> ) #13
- %20 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %19, <8 x double> %6, <8 x double> ) #13
- %21 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %20, <8 x double> %6, <8 x double> ) #13
+ %10 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
+ %11 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
+ %12 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
+ %13 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %7, <8 x double> %11, <8 x double> %12) #13
+ %14 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
+ %15 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> , <8 x double> ) #13
+ %16 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %7, <8 x double> %14, <8 x double> %15) #13
+ %17 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %8, <8 x double> %13, <8 x double> %16) #13
+ %18 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %9, <8 x double> %10, <8 x double> %17) #13
+ %19 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %18, <8 x double> %6, <8 x double> ) #13
+ %20 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %19, <8 x double> %6, <8 x double> ) #13
+ %21 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %20, <8 x double> %6, <8 x double> ) #13
 %22 = ashr <8 x i64> %4, 
 %23 = add nsw <8 x i64> %22, 
 %24 = shl <8 x i64> %23, 
@@ -208,7 +208,7 @@ end
 @inline function log1p(v::Vec{8,Float64})
 Base.llvmcall(("""
-declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 ""","""
 %2 = fadd <8 x double> %0, 
 %3 = fcmp one <8 x double> %2, zeroinitializer
@@ -237,22 +237,22 @@ declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
 %26 = fdiv <8 x double> %22, %25
 %27 = fmul <8 x double> %26, %26
 %28 = fmul <8 x double> %27, %27
- %29 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> , <8 x double> ) #16
- %30 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> %29, <8 x double> ) #16
+ %29 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> , <8 x double> ) #16
+ %30 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> %29, <8 x double> ) #16
 %31 = fmul <8 x double> %28, %30
- %32 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> , <8 x double> ) #16
- %33 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> %32, <8 x double> ) #16
- %34 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> %33, <8 x double> ) #16
+ %32 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> , <8 x double> ) #16
+ %33 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> %32, <8 x double> ) #16
+ %34 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> %33, <8 x double> ) #16
 %35 = fmul <8 x double> %27, %34
 %36 = fadd <8 x double> %31, %35
 %37 = sitofp <8 x i64> %8 to <8 x double>
 %38 = fadd <8 x double> %24, %36
 %39 = fmul <8 x double> %37, 
 %40 = fadd <8 x double> %15, %39
- %41 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %26, <8 x double> %38, <8 x double> %40) #16
+ %41 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %26, <8 x double> %38, <8 x double> %40) #16
 %42 = fsub <8 x double> %41, %24
 %43 = fadd <8 x double> %22, %42
- %44 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %37, <8 x double> , <8 x double> %43) #16
+ %44 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %37, <8 x double> , <8 x double> %43) #16
 %45 = fcmp oeq <8 x double> %0, 
 %46 = select <8 x i1> %45, <8 x double> , <8 x double> %44
 %47 = select <8 x i1> %3, <8 x double> %46, <8 x double> 
@@ -268,7 +268,7 @@ end
 # @inline function log1p(v::Vec{8,Float64})
 # Base.llvmcall(("""
 # declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
-# declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+# declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 # declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32)
 # ""","""
 # %2 = fadd <8 x double> %0, 
@@ -282,32 +282,32 @@
 # %10 = add <8 x i64> %9, 
 # %11 = bitcast <8 x i64> %10 to <8 x double>
 # %12 = fadd <8 x double> %11, 
-# %13 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %0, <8 x double> %11, <8 x double> %12) #13
+# %13 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %0, <8 x double> %11, <8 x double> %12) #13
 # %14 = fmul <8 x double> %6, 
 # %15 = fsub <8 x double> , %14
-# %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> , <8 x double> %15) #13
-# %17 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %6, <8 x double> , <8 x double> %16) #13
+# %16 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> , <8 x double> %15) #13
+# %17 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %6, <8 x double> , <8 x double> %16) #13
 # %18 = fadd <8 x double> %13, 
 # %19 = fsub <8 x double> , %18
 # %20 = fadd <8 x double> %13, %19
 # %21 = fdiv <8 x double> , %18
 # %22 = fmul <8 x double> %13, %21
 # %23 = fsub <8 x double> , %22
-# %24 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %21, <8 x double> %13, <8 x double> %23) #13
+# %24 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %21, <8 x double> %13, <8 x double> %23) #13
 # %25 = fsub <8 x double> , %21
-# %26 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %18, <8 x double> %25, <8 x double> ) #13
-# %27 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %20, <8 x double> %25, <8 x double> %26) #13
-# %28 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %21, <8 x double> zeroinitializer, <8 x double> %24) #13
-# %29 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %22, <8 x double> %27, <8 x double> %28) #13
+# %26 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %18, <8 x double> %25, <8 x double> ) #13
+# %27 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %20, <8 x double> %25, <8 x double> %26) #13
+# %28 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %21, <8 x double> zeroinitializer, <8 x double> %24) #13
+# %29 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %22, <8 x double> %27, <8 x double> %28) #13
 # %30 = fmul <8 x double> %22, %22
 # %31 = fmul <8 x double> %30, %30
 # %32 = fmul <8 x double> %31, %31
-# %33 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %30, <8 x double> , <8 x double> ) #13
-# %34 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %31, <8 x double> , <8 x double> %33) #13
-# %35 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %30, <8 x double> , <8 x double> ) #13
-# %36 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %30, <8 x double> , <8 x double> ) #13
-# %37 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %31, <8 x double> %35, <8 x double> %36) #13
-# %38 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %32, <8 x double> %34, <8 x double> %37) #13
+# %33 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %30, <8 x double> , <8 x double> ) #13
+# %34 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %31, <8 x double> , <8 x double> %33) #13
+# %35 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %30, <8 x double> , <8 x double> ) #13
+# %36 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %30, <8 x double> , <8 x double> ) #13
+# %37 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %31, <8 x double> %35, <8 x double> %36) #13
+# %38 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %32, <8 x double> %34, <8 x double> %37) #13
 # %39 = fmul <8 x double> %22, 
 # %40 = fmul <8 x double> %29, 
 # %41 = fadd <8 x double> %14, %39
@@ -343,7 +343,7 @@ end
 Base.llvmcall(("""
 declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) #16
 declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) #16
-declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float> , <8 x float>) #16
+declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float> , <8 x float>) #16
 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32)
 declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 ""","""
@@ -360,12 +360,12 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 11: ; preds = %1
 %12 = fmul <8 x float> %4, %4
- %13 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %12, <8 x float> , <8 x float> ) #16
- %14 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %12, <8 x float> %13, <8 x float> ) #16
- %15 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %12, <8 x float> %14, <8 x float> ) #16
- %16 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %12, <8 x float> %15, <8 x float> ) #16
+ %13 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %12, <8 x float> , <8 x float> ) #16
+ %14 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %12, <8 x float> %13, <8 x float> ) #16
+ %15 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %12, <8 x float> %14, <8 x float> ) #16
+ %16 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %12, <8 x float> %15, <8 x float> ) #16
 %17 = fmul <8 x float> %16, %12
- %18 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %17, <8 x float> %4, <8 x float> %4) #16
+ %18 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %17, <8 x float> %4, <8 x float> %4) #16
 %19 = bitcast <8 x float> %18 to <8 x i32>
 %20 = tail call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %7, <8 x float> ) #16
 %21 = icmp eq i32 %20, 0
@@ -377,14 +377,14 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 %25 = fmul <8 x float> %4, 
 %26 = tail call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %25, i32 0)
 %27 = fneg <8 x float> %26
- %28 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %27, <8 x float> , <8 x float> %24) #16
- %29 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %27, <8 x float> , <8 x float> %28) #16
- %30 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %29, <8 x float> , <8 x float> ) #16
- %31 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %29, <8 x float> %30, <8 x float> ) #16
- %32 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %29, <8 x float> %31, <8 x float> ) #16
- %33 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %29, <8 x float> %32, <8 x float> ) #16
+ %28 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %27, <8 x float> , <8 x float> %24) #16
+ %29 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %27, <8 x float> , <8 x float> %28) #16
+ %30 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %29, <8 x float> , <8 x float> ) #16
+ %31 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %29, <8 x float> %30, <8 x float> ) #16
+ %32 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %29, <8 x float> %31, <8 x float> ) #16
+ %33 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %29, <8 x float> %32, <8 x float> ) #16
 %34 = fmul <8 x float> %29, %29
- %35 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %33, <8 x float> %34, <8 x float> %29) #16
+ %35 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %33, <8 x float> %34, <8 x float> %29) #16
 %36 = fadd <8 x float> %35, 
 %37 = fcmp ole <8 x float> %24, 
 %38 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %26) #16
@@ -397,7 +397,7 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 %45 = fdiv <8 x float> , %44
 %46 = select <8 x i1> %37, <8 x float> , <8 x float> %45
 %47 = select <8 x i1> %43, <8 x float> zeroinitializer, <8 x float> %46
- %48 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %47, <8 x float> , <8 x float> ) #16
+ %48 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %47, <8 x float> , <8 x float> ) #16
 %49 = bitcast <8 x float> %48 to <8 x i32>
 %50 = select <8 x i1> %5, <8 x i32> %23, <8 x i32> %49
 br label %51
@@ -411,7 +411,7 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 end
 @inline function tanh(v::Vec{4,Float64})
 Base.llvmcall(("""
- declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #16
+ declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #16
 declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) #16
 declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) #16
 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32)
@@ -430,14 +430,14 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 11: ; preds = %1
 %12 = fmul fast <4 x double> %4, %4
- %13 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %12, <4 x double> , <4 x double> ) #16
- %14 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %12, <4 x double> %13, <4 x double> ) #16
+ %13 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %12, <4 x double> , <4 x double> ) #16
+ %14 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %12, <4 x double> %13, <4 x double> ) #16
 %15 = fmul fast <4 x double> %14, %12
 %16 = fadd fast <4 x double> %12, 
- %17 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %12, <4 x double> %16, <4 x double> ) #16
- %18 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %12, <4 x double> %17, <4 x double> ) #16
+ %17 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %12, <4 x double> %16, <4 x double> ) #16
+ %18 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %12, <4 x double> %17, <4 x double> ) #16
 %19 = fdiv fast <4 x double> %15, %18
- %20 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %19, <4 x double> %4, <4 x double> %4) #16
+ %20 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %19, <4 x double> %4, <4 x double> %4) #16
 %21 = tail call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %7, <4 x double> ) #16
 %22 = icmp eq i32 %21, 0
 br i1 %22, label %23, label %59
@@ -448,16 +448,16 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 %26 = fmul fast <4 x double> %4, 
 %27 = tail call fast <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %26, i32 0)
 %28 = fneg fast <4 x double> %27
- %29 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %28, <4 x double> , <4 x double> %25) #16
+ %29 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %28, <4 x double> , <4 x double> %25) #16
 %30 = fmul fast <4 x double> %27, 
 %31 = fsub fast <4 x double> %29, %30
 %32 = fmul fast <4 x double> %31, %31
- %33 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> , <4 x double> ) #16
- %34 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> %33, <4 x double> ) #16
- %35 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> %34, <4 x double> ) #16
- %36 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> %35, <4 x double> ) #16
+ %33 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> , <4 x double> ) #16
+ %34 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> %33, <4 x double> ) #16
+ %35 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> %34, <4 x double> ) #16
+ %36 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> %35, <4 x double> ) #16
 %37 = fneg fast <4 x double> %32
- %38 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %37, <4 x double> %36, <4 x double> %31) #16
+ %38 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %37, <4 x double> %36, <4 x double> %31) #16
 %39 = fmul fast <4 x double> %38, %31
 %40 = fsub fast <4 x double> , %38
 %41 = fdiv fast <4 x double> %39, %40
@@ -476,7 +476,7 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) #16
 %54 = fdiv fast <4 x double> , %53
 %55 = select <4 x i1> %45, <4 x double> , <4 x double> %54
 %56 = select <4 x i1> %52, <4 x double> zeroinitializer, <4 x double> %55
- %57 = tail call fast <4 x double> @llvm.fma.v4f64(<4 x double> %56, <4 x double> , <4 x double> ) #16
+ %57 = tail call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %56, <4 x double> , <4 x double> ) #16
 %58 = select <4 x i1> %5, <4 x double> %24, <4 x double> %57
 br label %59
@@ -491,7 +491,7 @@
 @inline function atanh(v::Vec{4,Float64})
 Base.llvmcall(("""
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #16
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #16
 ""","""
 %2 = bitcast <4 x double> %0 to <4 x i64>
 %3 = and <4 x i64> %2, 
@@ -502,7 +502,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 %8 = select <4 x i1> %7, <4 x double> %4, <4 x double> %5
 %9 = fdiv <4 x double> %8, %6
 %10 = and <4 x i64> %2, 
- %11 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %5, <4 x double> %9, <4 x double> %5) #16
+ %11 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %5, <4 x double> %9, <4 x double> %5) #16
 %12 = select <4 x i1> %7, <4 x double> %11, <4 x double> %9
 %13 = fadd <4 x double> %12, 
 %14 = fcmp one <4 x double> %13, zeroinitializer
@@ -524,21 +524,21 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 %30 = fdiv <4 x double> %26, %29
 %31 = fmul <4 x double> %30, %30
 %32 = fmul <4 x double> %31, %31
- %33 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> , <4 x double> ) #16
- %34 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> %33, <4 x double> ) #16
+ %33 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> , <4 x double> ) #16
+ %34 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> %33, <4 x double> ) #16
 %35 = fmul <4 x double> %34, %32
- %36 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> , <4 x double> ) #16
- %37 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> %36, <4 x double> ) #16
- %38 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %32, <4 x double> %37, <4 x double> ) #16
+ %36 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> , <4 x double> ) #16
+ %37 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> %36, <4 x double> ) #16
+ %38 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %32, <4 x double> %37, <4 x double> ) #16
 %39 = fmul <4 x double> %38, %31
 %40 = sitofp <4 x i64> %19 to <4 x double>
 %41 = fadd <4 x double> %35, %28
 %42 = fadd <4 x double> %41, %39
 %43 = fmul <4 x double> %40, 
- %44 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %30, <4 x double> %42, <4 x double> %43) #16
+ %44 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %30, <4 x double> %42, <4 x double> %43) #16
 %45 = fsub <4 x double> %26, %28
 %46 = fadd <4 x double> %45, %44
- %47 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %40, <4 x double> , <4 x double> %46) #16
+ %47 = tail call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %40, <4 x double> , <4 x double> %46) #16
 %48 = fcmp oeq <4 x double> %12, 
 %49 = fcmp ult <4 x double> %13, zeroinitializer
 %50 = fmul <4 x double> %47, 
@@ -558,7 +558,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 @static if SIMDPirates.VectorizationBase.AVX512F
 @inline function tanh(v::Vec{16,Float32})
 Base.llvmcall(("""
- declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float> )
+ declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float> )
 declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
 declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
 """, """
@@ -574,12 +574,12 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 10: ; preds = %1
 %11 = fmul <16 x float> %4, %4
- %12 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %11, <16 x float> , <16 x float> ) #16
- %13 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %11, <16 x float> %12, <16 x float> ) #16
- %14 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %11, <16 x float> %13, <16 x float> ) #16
- %15 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %11, <16 x float> %14, <16 x float> ) #16
+ %12 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %11, <16 x float> , <16 x float> ) #16
+ %13 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %11, <16 x float> %12, <16 x float> ) #16
+ %14 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %11, <16 x float> %13, <16 x float> ) #16
+ %15 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %11, <16 x float> %14, <16 x float> ) #16
 %16 = fmul <16 x float> %11, %15
- %17 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %16, <16 x float> %4, <16 x float> %4) #16
+ %17 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %16, <16 x float> %4, <16 x float> %4) #16
 %18 = bitcast <16 x float> %17 to <16 x i32>
 %19 = icmp eq i16 %6, -1
 br i1 %19, label %48, label %20
@@ -589,14 +589,14 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 %22 = fadd <16 x float> %4, %4
 %23 = fmul <16 x float> %22, 
 %24 = tail call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %23, i32 0, <16 x float> zeroinitializer, i16 -1, i32 4)
- %25 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %24, <16 x float> , <16 x float> %22) #16
- %26 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %24, <16 x float> , <16 x float> %25) #16
- %27 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %26, <16 x float> , <16 x float> ) #16
- %28 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %26, <16 x float> %27, <16 x float> ) #16
- %29 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %26, <16 x float> %28, <16 x float> ) #16
- %30 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %26, <16 x float> %29, <16 x float> ) #16
+ %25 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %24, <16 x float> , <16 x float> %22) #16
+ %26 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %24, <16 x float> , <16 x float> %25) #16
+ %27 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %26, <16 x float> , <16 x float> ) #16
+ %28 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %26, <16 x float> %27, <16 x float> ) #16
+ %29 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %26, <16 x float> %28, <16 x float> ) #16
+ %30 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %26, <16 x float> %29, <16 x float> ) #16
 %31 = fmul <16 x float> %26, %26
- %32 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %30, <16 x float> %31, <16 x float> %26) #16
+ %32 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %30, <16 x float> %31, <16 x float> %26) #16
 %33 = fadd <16 x float> %32, 
 %34 = fcmp ole <16 x float> %22, 
 %35 = tail call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %24, <16 x i32> zeroinitializer, i16 -1, i32 4) #16
@@ -609,7 +609,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 %42 = fdiv <16 x float> , %41
 %43 = select <16 x i1> %34, <16 x float> , <16 x float> %42
 %44 = select <16 x i1> %40, <16 x float> zeroinitializer, <16 x float> %43
- %45 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %44, <16 x float> , <16 x float> ) #16
+ %45 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %44, <16 x float> , <16 x float> ) #16
 %46 = bitcast <16 x float> %45 to <16 x i32>
 %47 = select <16 x i1> %5, <16 x i32> %21, <16 x i32> %46
 br label %48
@@ -623,7 +623,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 end
 @inline function tanh(v::Vec{8,Float64})
 Base.llvmcall(("""
- declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+ declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)
 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
 declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double>, <8 x i32>, i8, i32) #16
 """, """
@@ -638,14 +638,14 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 9: ; preds = %1
 %10 = fmul <8 x double> %4, %4
- %11 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> , <8 x double> ) #16
- %12 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> %11, <8 x double> ) #16
+ %11 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> , <8 x double> ) #16
+ %12 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> %11, <8 x double> ) #16
 %13 = fmul <8 x double> %12, %10
 %14 = fadd <8 x double> %10, 
- %15 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> %14, <8 x double> ) #16
- %16 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %10, <8 x double> %15, <8 x double> ) #16
+ %15 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> %14, <8 x double> ) #16
+ %16 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %10, <8 x double> %15, <8 x double> ) #16
 %17 = fdiv <8 x double> %13, %16
- %18 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %17, <8 x double> %4, <8 x double> %4) #16
+ %18 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %17, <8 x double> %4, <8 x double> %4) #16
 %19 = icmp eq i8 %6, -1
 br i1 %19, label %54, label %20
@@ -654,16 +654,16 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 %22 = fmul <8 x double> %4, 
 %23 = fmul <8 x double> %4, 
 %24 = tail call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %23, i32 0, <8 x double> zeroinitializer, i8 -1, i32 4)
- %25 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %24, <8 x double> , <8 x double> %22) #16
+ %25 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %24, <8 x double> , <8 x double> %22) #16
 %26 = fmul <8 x double> %24, 
 %27 = fsub <8 x double> %25, %26
 %28 = fmul <8 x double> %27, %27
- %29 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> , <8 x double> ) #16
- %30 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> %29, <8 x double> ) #16
- %31 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> %30, <8 x double> ) #16
- %32 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> %31, <8 x double> ) #16
+ %29 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> , <8 x double> ) #16
+ %30 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> %29, <8 x double> ) #16
+ %31 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> %30, <8 x double> ) #16
+ %32 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> %31, <8 x double> ) #16
 %33 = fneg <8 x double> %32
- %34 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %28, <8 x double> %33, <8 x double> %27) #16
+ %34 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %28, <8 x double> %33, <8 x double> %27) #16
 %35 = fmul <8 x double> %34, %27
 %36 = fsub <8 x double> , %34
 %37 = fdiv <8 x double> %35, %36
@@ -681,7 +681,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 %49 = fdiv <8 x double> , %48
 %50 = select <8 x i1> %41, <8 x double> , <8 x double> %49
 %51 = select <8 x i1> %47, <8 x double> zeroinitializer, <8 x double> %50
- %52 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %51, <8 x double> , <8 x double> ) #16
+ %52 = tail call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %51, <8 x double> , <8 x double> ) #16
 %53 = select <8 x i1> %5, <8 x double> %21, <8 x double> %52
 br label %54
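
A note on what this patch does (commentary, not part of the diff): every hand-written kernel above declares and calls @llvm.fmuladd.* where it previously used @llvm.fma.*. llvm.fma requires a fused multiply-add with a single rounding, so on a target without FMA hardware LLVM must fall back to a correctly rounded fma libcall, scalarizing the vector operation. llvm.fmuladd instead lets LLVM fuse only when that is efficient, and otherwise emit a plain multiply followed by an add. The same pair exists at the Julia level (Base.fma lowers to llvm.fma, Base.muladd to llvm.fmuladd), so the semantic difference can be sketched without any LLVM IR:

    # fma: one rounding, always; muladd: the compiler picks the faster lowering.
    a = 1.0 + 2.0^-30
    b = 1.0 - 2.0^-30      # a*b = 1 - 2^-60, which Float64 cannot represent
    c = -1.0
    fma(a, b, c)           # -2.0^-60 exactly (fused, single rounding)
    muladd(a, b, c)        # -2.0^-60 if fused, 0.0 if lowered as mul then add

These kernels are polynomial approximations whose error budget tolerates either lowering, which is presumably why the relaxed intrinsic is the better fit here.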