From 9908073242e7b35d1e8bd661e01a5981cbd0ea15 Mon Sep 17 00:00:00 2001
From: XNNPACK Team
Date: Thu, 24 Oct 2024 09:54:00 -0700
Subject: [PATCH] Define xnn_float16 to be _Float16 if we detect that we are
 compiling in an environment that supports it fully.

Add explicit casts in a handful of necessary cases.

PiperOrigin-RevId: 689418297
---
 bench/f16-rminmax.cc              |  2 +-
 src/xnnpack/math.h                | 57 +++++++++++++++++++++++++++++++
 test/gemm-microkernel-tester.cc   | 38 ++++++++++-----------
 test/reduce-microkernel-tester.h  |  2 +-
 test/softmax-operator-tester.h    |  4 +--
 test/static-constant-pad.cc       |  8 +++--
 test/unary-operator-tester.h      |  2 +-
 test/vbinary-microkernel-tester.h | 17 +++++++--
 test/vcvt-microkernel-tester.cc   |  6 ++--
 test/vunary-microkernel-tester.h  |  2 +-
 10 files changed, 106 insertions(+), 32 deletions(-)

diff --git a/bench/f16-rminmax.cc b/bench/f16-rminmax.cc
index 32866e1df622..d6803f754448 100644
--- a/bench/f16-rminmax.cc
+++ b/bench/f16-rminmax.cc
@@ -43,7 +43,7 @@ static void f16_rminmax(
     init_params(&params);
   }
 
-  xnn_float16 output[2] = {std::nanf(""), std::nanf("")};
+  xnn_float16 output[2] = {(xnn_float16)std::nanf(""), (xnn_float16)std::nanf("")};
   for (auto _ : state) {
     rminmax(elements * sizeof(xnn_float16), input.data(), output, &params);
  }
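Why the casts: once xnn_float16 can be a native _Float16 (see the src/xnnpack/math.h change below), brace-initializing an array from a plain float is a narrowing conversion, which C++ rejects in list-initialization; the cast makes the conversion explicit in the native case and is a harmless no-op change for the struct emulation, so it compiles in both configurations. A minimal sketch of the failure mode, assuming a Clang or GCC toolchain with native _Float16 (the alias below stands in for the XNN_HAVE_FLOAT16 branch of the real header):

    #include <cmath>

    using xnn_float16 = _Float16;  // stand-in for the XNN_HAVE_FLOAT16 case

    int main() {
      // Ill-formed: list-initialization narrows 'float' to '_Float16'.
      // xnn_float16 bad[2] = {std::nanf(""), std::nanf("")};

      // OK: the cast makes the conversion explicit.
      xnn_float16 ok[2] = {(xnn_float16)std::nanf(""), (xnn_float16)std::nanf("")};
      return ok[0] == ok[0];  // returns 0: NaN never compares equal to itself
    }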
diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h
index 893e038a3126..cda13f50ee66 100644
--- a/src/xnnpack/math.h
+++ b/src/xnnpack/math.h
@@ -426,6 +426,46 @@ XNN_INLINE static uint16_t math_cvt_bf16_fp32(float x) {
 }  // extern "C"
 #endif
 
+// We want to use _Float16 if the compiler supports it fully, but it's
+// tricky to do this detection; there are compiler versions that "support" it
+// as a type but without any operations on it (i.e., it just supports conversion
+// to/from float32). We're only going to bother using it if the support is
+// mostly complete, which generally means a recent version of Clang or GCC,
+// x86 or ARM architectures, and (in some cases) the right architecture
+// flags specified on the command line.
+
+#ifndef XNN_HAVE_FLOAT16
+
+// Some non-GCC compilers define __GNUC__, but we only want to detect the Real
+// Thing.
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && \
+    !defined(__INTEL_LLVM_COMPILER)
+#define XNN_GNUC_ACTUAL __GNUC__
+#else
+#define XNN_GNUC_ACTUAL 0
+#endif
+
+#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__) && \
+    defined(__FLT16_MAX__) && \
+    ((__clang_major__ >= 15 && !defined(_MSC_VER)) || (XNN_GNUC_ACTUAL >= 12))
+#define XNN_HAVE_FLOAT16 1
+#endif
+
+#if (defined(__aarch64__) && !defined(_MSC_VER)) && \
+    ((defined(__clang__) && (__clang_major__ >= 15)) || \
+     (XNN_GNUC_ACTUAL >= 13))
+#define XNN_HAVE_FLOAT16 1
+#endif
+
+#if defined(__riscv) && defined(__riscv_zvfh) && __clang_major__ >= 16
+#define XNN_HAVE_FLOAT16 1
+#endif
+
+#endif  // XNN_HAVE_FLOAT16
+
+#ifdef XNN_HAVE_FLOAT16
+typedef _Float16 xnn_float16;
+#else
 // We want float16s to be a distinct type from uint16_t, to avoid accidental
 // reinterpret casts as integers. This type is designed to produce errors when
 // using it as an arithmetic type in C, and designed to emulate a native float16
@@ -441,6 +481,7 @@ struct xnn_float16 {
 #endif
 };
 typedef struct xnn_float16 xnn_float16;
+#endif
 
 struct xnn_bfloat16 {
   uint16_t value;
@@ -460,13 +501,21 @@ extern "C" {
 #endif
 
 XNN_INLINE static xnn_float16 xnn_float16_from_float(float f) {
+#ifdef XNN_HAVE_FLOAT16
+  return f;
+#else
   struct xnn_float16 result;
   result.value = fp16_ieee_from_fp32_value(f);
   return result;
+#endif
 }
 
 XNN_INLINE static float xnn_float16_to_float(xnn_float16 fp16) {
+#ifdef XNN_HAVE_FLOAT16
+  return (float) fp16;
+#else
   return fp16_ieee_to_fp32_value(fp16.value);
+#endif
 }
 
 XNN_INLINE static xnn_bfloat16 xnn_bfloat16_from_float(float f) {
@@ -480,14 +529,22 @@ XNN_INLINE static float xnn_bfloat16_to_float(xnn_bfloat16 bf16) {
 }
 
 XNN_INLINE static xnn_float16 xnn_float16_zero() {
+#ifdef XNN_HAVE_FLOAT16
+  return 0.0f;
+#else
   struct xnn_float16 result;
   result.value = 0;
   return result;
+#endif
 }
 
 XNN_INLINE static bool xnn_float16_is_zero(xnn_float16 f) {
+#ifdef XNN_HAVE_FLOAT16
+  return f == 0.0f || f == -0.0f;
+#else
   // Check for +/- zero (0x0000/0x8000). uint16 overflow is well defined to wrap around.
   return (uint16_t) (f.value * 2) == 0;
+#endif
 }
 
 #ifdef __cplusplus
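Both branches of the header expose the same small accessor API, so client code behaves identically whichever representation is selected. A quick usage sketch (the include path is an assumption based on the source layout; the functions are the ones defined above):

    #include <cassert>

    #include "xnnpack/math.h"

    int main() {
      // 0.5f is exactly representable in IEEE binary16, so the round trip is exact.
      const xnn_float16 half = xnn_float16_from_float(0.5f);
      assert(xnn_float16_to_float(half) == 0.5f);

      // Both +0.0 (0x0000) and -0.0 (0x8000) must register as zero: a floating
      // point compare in the native branch, a bit trick in the fallback.
      assert(xnn_float16_is_zero(xnn_float16_from_float(-0.0f)));
      assert(xnn_float16_is_zero(xnn_float16_zero()));
      return 0;
    }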
diff --git a/test/gemm-microkernel-tester.cc b/test/gemm-microkernel-tester.cc
index 2f2aebe64099..628bb09225df 100644
--- a/test/gemm-microkernel-tester.cc
+++ b/test/gemm-microkernel-tester.cc
@@ -233,7 +233,7 @@ void GemmMicrokernelTester::Test(
       const float tolerance = std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f);
       EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
           << "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
-          << "), optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
+          << "), optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
           << nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
     }
   }
@@ -950,7 +950,7 @@ void GemmMicrokernelTester::Test(
       const float tolerance = std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f);
       EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
           << "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
-          << "), optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
+          << "), optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
           << nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
     }
   }
@@ -1219,7 +1219,7 @@ void GemmMicrokernelTester::Test(
       EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
           << "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
           << " (accumulator = " << acc[i * n() + j]
-          << "), optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
+          << "), optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
           << nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
     }
   }
@@ -1381,7 +1381,7 @@ void GemmMicrokernelTester::Test(
       const float tolerance = std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-3f);
       EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j], tolerance)
           << "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
-          << ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
+          << ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x "
           << nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k2;
     }
   }
@@ -2229,9 +2229,9 @@ void GemmMicrokernelTester::Test(
     std::fill(packed_w.begin(), packed_w.end(), 0);
 
     pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
-         reinterpret_cast(b.data()),
+        reinterpret_cast(b.data()),
         reinterpret_cast(bias.data()), /*scale=*/nullptr,
-         reinterpret_cast(packed_w.data()),
+        reinterpret_cast(packed_w.data()),
         /*extra_bytes=*/0, /*params=*/nullptr);
 
     for (size_t m_index = 0; m_index < m(); m_index++) {
@@ -2292,7 +2292,7 @@ void GemmMicrokernelTester::Test(
 
   xnnpack::ReplicableRandomDevice rng;
   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
-  
+
   xnnpack::Buffer<xnn_float16> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(xnn_float16));
   xnnpack::Buffer<xnn_float16> b(n() * k());
   xnnpack::Buffer<xnn_float16> packed_w(
@@ -2309,10 +2309,10 @@ void GemmMicrokernelTester::Test(
     std::fill(packed_w.begin(), packed_w.end(), 0);
 
     pack(/*g=*/1, n(), k(), nr(), kr(), sr(),
-         reinterpret_cast(b.data()),
-         reinterpret_cast(bias.data()),
-         /*scale=*/nullptr,
-         reinterpret_cast(packed_w.data()),
+        reinterpret_cast(b.data()),
+        reinterpret_cast(bias.data()),
+        /*scale=*/nullptr,
+        reinterpret_cast(packed_w.data()),
         /*extra_bytes=*/0, /*params=*/nullptr);
 
     for (size_t m_index = 0; m_index < m(); m_index++) {
@@ -2355,7 +2355,7 @@ void GemmMicrokernelTester::Test(
     for (size_t j = 0; j < n(); j++) {
       EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j],
                   std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f))
          << "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
-          << ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
+          << ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
           << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k();
     }
@@ -2371,7 +2371,7 @@ void GemmMicrokernelTester::Test(
 
   xnnpack::ReplicableRandomDevice rng;
   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
-  
+
   xnnpack::Buffer<xnn_float16> a((mr() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(xnn_float16));
   xnnpack::Buffer<xnn_float16> b(n() * ks() * k());
   xnnpack::Buffer<xnn_float16> packed_w(
@@ -2390,9 +2390,9 @@ void GemmMicrokernelTester::Test(
     std::fill(packed_w.begin(), packed_w.end(), 0);
 
     pack(/*g=*/1, n(), ks(), k(), nr(), kr(), sr(),
-         reinterpret_cast(b.data()),
-         reinterpret_cast(bias.data()), /*scale=*/nullptr,
-         reinterpret_cast(packed_w.data()),
+        reinterpret_cast(b.data()),
+        reinterpret_cast(bias.data()), /*scale=*/nullptr,
+        reinterpret_cast(packed_w.data()),
         /*extra_bytes=*/0, /*params=*/nullptr);
 
     for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
@@ -2469,15 +2469,15 @@ void GemmMicrokernelTester::Test(
      for (size_t j = 0; j < n(); j++) {
         EXPECT_LE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_max)
             << "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
-            << ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
+            << ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
             << " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
         EXPECT_GE(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_min)
             << "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
-            << ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
+            << ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
             << " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
         EXPECT_NEAR(c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()], c_ref[i * n() + j],
                     std::max(1.0e-4f, std::abs(c_ref[i * n() + j]) * 1.0e-2f))
             << "at " << i << ", " << i << ": reference = " << c_ref[i * n() + j]
-            << ", optimized = " << c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
+            << ", optimized = " << (float)c[i * cm_stride() + (j / nr()) * cn_stride() + j % nr()] << ", Mr x Nr x Kr = " << mr() << " x " << nr()
             << " x " << kr() << ", M x N x KC x KS = " << m() << " x " << n() << " x " << k() << " x " << ks();
       }
diff --git a/test/reduce-microkernel-tester.h b/test/reduce-microkernel-tester.h
index 6a2e2c1607da..52150334d85b 100644
--- a/test/reduce-microkernel-tester.h
+++ b/test/reduce-microkernel-tester.h
@@ -70,7 +70,7 @@ class ReduceMicrokernelTester {
     }
 
     // Call optimized micro-kernel.
-    xnn_float16 output[2] = {std::nanf(""), std::nanf("")};
+    xnn_float16 output[2] = {(xnn_float16)std::nanf(""), (xnn_float16)std::nanf("")};
     reduce(batch_size() * sizeof(xnn_float16), input.data(), output, init_params != nullptr ? &params : nullptr);
 
     // Verify results.
diff --git a/test/softmax-operator-tester.h b/test/softmax-operator-tester.h
index d5091c11ae23..2b2fb9a04612 100644
--- a/test/softmax-operator-tester.h
+++ b/test/softmax-operator-tester.h
@@ -130,10 +130,10 @@ class SoftMaxOperatorTester {
     for (size_t i = 0; i < batch_size(); i++) {
       float sum_exp = 0.0;
       for (size_t c = 0; c < channels(); c++) {
-        sum_exp += std::exp(input[i * input_stride() + c]);
+        sum_exp += std::exp((float)input[i * input_stride() + c]);
       }
       for (size_t c = 0; c < channels(); c++) {
-        output_ref[i * channels() + c] = std::exp(input[i * input_stride() + c]) / sum_exp;
+        output_ref[i * channels() + c] = std::exp((float)input[i * input_stride() + c]) / sum_exp;
       }
     }
 
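The (float) casts in the reference computation above are load-bearing: with a native _Float16 and a pre-C++23 standard library, a call like std::exp(input[...]) can be rejected as ambiguous, since _Float16 converts equally well to float, double, and long double and <cmath> has no _Float16 overload. Converting to float first selects a single overload, and the f16-to-f32 conversion is exact, so the reference result is unchanged. A sketch of the failure and the fix (toolchain-dependent; the function name is illustrative):

    #include <cmath>

    float exp_f16(_Float16 h) {
      // return std::exp(h);      // may fail to compile: ambiguous overloaded call
      return std::exp((float)h);  // unambiguous; _Float16 -> float is lossless
    }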
diff --git a/test/static-constant-pad.cc b/test/static-constant-pad.cc
index 9c28465f80c9..5e0fdb7ceed9 100644
--- a/test/static-constant-pad.cc
+++ b/test/static-constant-pad.cc
@@ -9,6 +9,7 @@
 #include <cstddef>  // For size_t.
 #include
 #include <memory>  // For std::unique_ptr.
+#include
 #include <random>  // For std::uniform_real_distribution.
 #include <vector>  // For std::vector.
 
@@ -137,8 +138,11 @@ TEST_F(StaticConstantPadTestF16, define)
   std::array post_paddings;
   std::fill(pre_paddings.begin(), pre_paddings.begin() + dims.size(), dim_dist(rng));
   std::fill(post_paddings.begin(), post_paddings.begin() + dims.size(), dim_dist(rng));
-  xnn_float16 padding_value = f32dist(rng);
-  uint32_t padding_value_as_bits = padding_value.value;
+  union {
+    xnn_float16 padding_value;
+    uint16_t padding_value_as_bits;
+  };
+  padding_value = f32dist(rng);
 
   ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
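The union replaces the old field access (padding_value.value no longer exists when xnn_float16 is _Float16) by type-punning the half-precision value onto its 16-bit pattern, and it fixes the width at uint16_t where the old code read into a uint32_t. Reading a union member other than the last one written is a documented extension in GCC and Clang rather than strictly portable C++; a portable equivalent (a sketch, not part of the patch) goes through std::memcpy, which optimizers lower to the same 16-bit load:

    #include <cstdint>
    #include <cstring>

    static uint16_t xnn_float16_bits(xnn_float16 v) {
      static_assert(sizeof(v) == sizeof(uint16_t), "xnn_float16 is 16 bits");
      uint16_t bits;
      std::memcpy(&bits, &v, sizeof(bits));
      return bits;
    }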
diff --git a/test/unary-operator-tester.h b/test/unary-operator-tester.h
index 4b8715a24344..d80314fdb0fe 100644
--- a/test/unary-operator-tester.h
+++ b/test/unary-operator-tester.h
@@ -192,7 +192,7 @@ class UnaryOperatorTester {
     EXPECT_NEAR(y_ref, y, AbsTolF16(y_ref))
         << "at batch " << batch << " / " << batch_size() << ", channel "
         << channel << " / " << channels() << ", input "
-        << input;
+        << (float)input;
   }
 
   virtual void CheckResultQS8(int8_t y, float y_ref, size_t batch, size_t channel, int8_t input) const {
diff --git a/test/vbinary-microkernel-tester.h b/test/vbinary-microkernel-tester.h
index f51cc0a03786..c016644deb7a 100644
--- a/test/vbinary-microkernel-tester.h
+++ b/test/vbinary-microkernel-tester.h
@@ -45,10 +45,10 @@ class VBinaryMicrokernelTester {
           result[i] = a[i] + b[i * stride_b];
           break;
         case OpType::CopySign:
-          result[i] = std::copysign(a[i], b[i * stride_b]);
+          result[i] = copy_sign(a[i], b[i * stride_b]);
           break;
         case OpType::RCopySign:
-          result[i] = std::copysign(b[i * stride_b], a[i]);
+          result[i] = copy_sign(b[i * stride_b], a[i]);
           break;
         case OpType::Div:
           result[i] = a[i] / b[i * stride_b];
@@ -231,6 +231,19 @@ class VBinaryMicrokernelTester {
   uint8_t qmin_{0};
   uint8_t qmax_{255};
   size_t iterations_{15};
+
+  static float copy_sign(float a, float b) {
+    return std::copysign(a, b);
+  }
+
+  static int32_t copy_sign(int32_t a, int32_t b) {
+    return (int32_t)std::copysign((float)a, (float)b);
+  }
+
+  static xnn_float16 copy_sign(xnn_float16 a, xnn_float16 b) {
+    return (xnn_float16)std::copysign((float)a, (float)b);
+  }
+
 };
 
 #define XNN_TEST_BINARY_BATCH_EQ(ukernel, arch_flags, batch_tile, is_binaryc, \
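The copy_sign overloads keep the reference loop compiling for every element type: pre-C++23 std::copysign has no _Float16 overload, so the half-precision case is routed through float (an exact conversion), and the int32_t case gets the same sign-transfer semantics. Note the integer path is only exact while the operands fit in float's 24-bit significand, which holds for the tester's small random inputs. A smoke-test sketch, with the overloads restated as free functions since in the tester they are private helpers:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    static float copy_sign(float a, float b) { return std::copysign(a, b); }
    static int32_t copy_sign(int32_t a, int32_t b) {
      return (int32_t)std::copysign((float)a, (float)b);
    }

    void copy_sign_smoke_test() {
      assert(copy_sign(1.5f, -2.0f) == -1.5f);  // float overload
      assert(copy_sign(7, -1) == -7);           // int32_t overload, via float
    }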
diff --git a/test/vcvt-microkernel-tester.cc b/test/vcvt-microkernel-tester.cc
index d294192cf785..7f8bea3801b9 100644
--- a/test/vcvt-microkernel-tester.cc
+++ b/test/vcvt-microkernel-tester.cc
@@ -47,7 +47,7 @@ void VCvtMicrokernelTester::Test(
     for (size_t i = 0; i < batch_size(); i++) {
       ASSERT_EQ(float_as_uint32(output[i]), float_as_uint32(input[i]))
           << "at " << i << " / " << batch_size() << ", x[" << i << "] = 0x"
-          << std::hex << std::setw(4) << std::setfill('0') << input[i];
+          << std::hex << std::setw(4) << std::setfill('0') << (float) input[i];
     }
   }
 }
@@ -121,8 +121,8 @@ void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt,
                 static_cast(output_ref[i]), 1)
         << "at " << i << " / " << batch_size() << ", x[" << i << "] = 0x"
         << std::hex << std::setw(8) << std::setfill('0')
-        << float_as_uint32(input[i]) << " (" << input[i] << ")" << " INPUT "
-        << input[i] << " scale " << scale() << " zp "
+        << float_as_uint32(input[i]) << " (" << (float)input[i] << ")" << " INPUT "
+        << (float)input[i] << " scale " << scale() << " zp "
         << (int)output_zero_point();
     }
   }
diff --git a/test/vunary-microkernel-tester.h b/test/vunary-microkernel-tester.h
index 6fa97a90f0b6..b99c40d4d6ab 100644
--- a/test/vunary-microkernel-tester.h
+++ b/test/vunary-microkernel-tester.h
@@ -321,7 +321,7 @@ class VUnaryMicrokernelTester {
     for (size_t i = 0; i < batch_size(); i++) {
       ASSERT_NEAR(y[i], y_ref[i], tol(y_ref[i]))
           << "at " << i << " / " << batch_size() << ", x[" << i
-          << "] = " << std::scientific << x[i];
+          << "] = " << std::scientific << (float)x[i];
     }
   }
 }
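Which representation a build actually gets depends on the compiler, target, and flags probed in src/xnnpack/math.h above. A tiny diagnostic (illustrative only; the include path is an assumption), compiled with the same flags as the rest of the library, makes the outcome visible:

    #include <cstdio>

    #include "xnnpack/math.h"

    int main() {
    #ifdef XNN_HAVE_FLOAT16
      std::printf("xnn_float16 is native _Float16 (%zu bytes)\n", sizeof(xnn_float16));
    #else
      std::printf("xnn_float16 is the struct wrapper over uint16_t\n");
    #endif
      return 0;
    }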