diff --git a/BUILD.bazel b/BUILD.bazel index 16fbf7adac4..c4be242dc19 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -160,6 +160,7 @@ SUBGRAPH_SRCS = [ "src/subgraph/minimum2.c", "src/subgraph/multiply2.c", "src/subgraph/negate.c", + "src/subgraph/or.c", "src/subgraph/prelu.c", "src/subgraph/reciprocal-square-root.c", "src/subgraph/reshape-helpers.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d39c361fdf..c59808fa51c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -411,6 +411,7 @@ SET(SUBGRAPH_SRCS src/subgraph/minimum2.c src/subgraph/multiply2.c src/subgraph/negate.c + src/subgraph/or.c src/subgraph/prelu.c src/subgraph/reciprocal-square-root.c src/subgraph/reshape-helpers.c @@ -1627,6 +1628,11 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(negate-nc-test PRIVATE XNNPACK unary-operator-tester fp16 GTest::gtest GTest::gtest_main) ADD_TEST(NAME negate-nc-test COMMAND negate-nc-test) + ADD_EXECUTABLE(or-nd-test test/or-nd.cc) + TARGET_INCLUDE_DIRECTORIES(or-nd-test PRIVATE src test) + TARGET_LINK_LIBRARIES(or-nd-test PRIVATE XNNPACK binary-elementwise-operator-tester fp16 GTest::gtest GTest::gtest_main) + ADD_TEST(NAME or-nd-test COMMAND or-nd-test) + ADD_EXECUTABLE(prelu-nc-test test/prelu-nc.cc) TARGET_INCLUDE_DIRECTORIES(prelu-nc-test PRIVATE src test) TARGET_LINK_LIBRARIES(prelu-nc-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main) @@ -1982,6 +1988,11 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(negate-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph) ADD_TEST(NAME negate-test COMMAND negate-test) + ADD_EXECUTABLE(or-test test/or.cc) + TARGET_INCLUDE_DIRECTORIES(or-test PRIVATE src test) + TARGET_LINK_LIBRARIES(or-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph) + ADD_TEST(NAME or-test COMMAND or-test) + ADD_EXECUTABLE(prelu-test test/prelu.cc) TARGET_INCLUDE_DIRECTORIES(prelu-test PRIVATE src test) TARGET_LINK_LIBRARIES(prelu-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph) @@ -2789,6 +2800,18 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(s32-vmulc-test PRIVATE vbinaryc-microkernel-tester hardware-config logging microkernels-all microparams-init) ADD_TEST(NAME s32-vmulc-test COMMAND s32-vmulc-test) + ADD_EXECUTABLE(s32-vor-test test/s32-vor.cc) + TARGET_INCLUDE_DIRECTORIES(s32-vor-test PRIVATE include src test) + TARGET_LINK_LIBRARIES(s32-vor-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main) + TARGET_LINK_LIBRARIES(s32-vor-test PRIVATE vbinary-microkernel-tester hardware-config logging microkernels-all microparams-init) + ADD_TEST(NAME s32-vor-test COMMAND s32-vor-test) + + ADD_EXECUTABLE(s32-vorc-test test/s32-vorc.cc) + TARGET_INCLUDE_DIRECTORIES(s32-vorc-test PRIVATE include src test) + TARGET_LINK_LIBRARIES(s32-vorc-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main) + TARGET_LINK_LIBRARIES(s32-vorc-test PRIVATE vbinaryc-microkernel-tester hardware-config logging microkernels-all microparams-init) + ADD_TEST(NAME s32-vorc-test COMMAND s32-vorc-test) + ADD_EXECUTABLE(f16-vcmul-test test/f16-vcmul.cc) SET_TARGET_PROPERTIES(f16-vcmul-test PROPERTIES CXX_EXTENSIONS YES) TARGET_INCLUDE_DIRECTORIES(f16-vcmul-test PRIVATE include src test) diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index d165cbf3e39..5d47d8defcf 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -570,6 +570,8 @@ SET(ALL_AVX2_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c src/s32-vmul/gen/s32-vmul-avx2.c 
src/s32-vmul/gen/s32-vmulc-avx2.c + src/s32-vor/gen/s32-vor-avx2.c + src/s32-vor/gen/s32-vorc-avx2.c src/x8-lut/gen/x8-lut-avx2-u32.c src/x8-lut/gen/x8-lut-avx2-u64.c src/x8-lut/gen/x8-lut-avx2-u96.c diff --git a/cmake/gen/avx512f_microkernels.cmake b/cmake/gen/avx512f_microkernels.cmake index 099ec1a04cb..27be19c1338 100644 --- a/cmake/gen/avx512f_microkernels.cmake +++ b/cmake/gen/avx512f_microkernels.cmake @@ -311,5 +311,7 @@ SET(ALL_AVX512F_MICROKERNEL_SRCS src/math/f32-sqrt-avx512f-nr2fma.c src/s32-vmul/gen/s32-vmul-avx512f.c src/s32-vmul/gen/s32-vmulc-avx512f.c + src/s32-vor/gen/s32-vor-avx512f.c + src/s32-vor/gen/s32-vorc-avx512f.c src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4-prfm.c src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4.c) diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index 407389db353..e01a30d2dbb 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -894,6 +894,8 @@ SET(ALL_NEON_MICROKERNEL_SRCS src/s16-window/gen/s16-window-shift15-neon-u32.c src/s32-vmul/gen/s32-vmul-neon.c src/s32-vmul/gen/s32-vmulc-neon.c + src/s32-vor/gen/s32-vor-neon.c + src/s32-vor/gen/s32-vorc-neon.c src/u8-ibilinear/gen/u8-ibilinear-neon-c8.c src/u8-ibilinear/gen/u8-ibilinear-neon-c16.c src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index ebea43802e4..559d1a740d7 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -1052,6 +1052,8 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS src/s16-window/gen/s16-window-scalar-u4.c src/s32-vmul/gen/s32-vmul-scalar.c src/s32-vmul/gen/s32-vmulc-scalar.c + src/s32-vor/gen/s32-vor-scalar.c + src/s32-vor/gen/s32-vorc-scalar.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c diff --git a/cmake/gen/sse41_microkernels.cmake b/cmake/gen/sse41_microkernels.cmake index 6f6383f3ed8..cec3a788bd4 100644 --- a/cmake/gen/sse41_microkernels.cmake +++ b/cmake/gen/sse41_microkernels.cmake @@ -381,5 +381,7 @@ SET(ALL_SSE41_MICROKERNEL_SRCS src/s8-vclamp/s8-vclamp-sse41-u64.c src/s32-vmul/gen/s32-vmul-sse41.c src/s32-vmul/gen/s32-vmulc-sse41.c + src/s32-vor/gen/s32-vor-sse41.c + src/s32-vor/gen/s32-vorc-sse41.c src/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c src/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c) diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake index 1f3ea9da1b8..d58738bfaa9 100644 --- a/cmake/gen/wasmsimd_microkernels.cmake +++ b/cmake/gen/wasmsimd_microkernels.cmake @@ -1190,6 +1190,8 @@ SET(ALL_WASMSIMD_MICROKERNEL_SRCS src/s8-vclamp/s8-vclamp-wasmsimd-u64.c src/s32-vmul/gen/s32-vmul-wasmsimd.c src/s32-vmul/gen/s32-vmulc-wasmsimd.c + src/s32-vor/gen/s32-vor-wasmsimd.c + src/s32-vor/gen/s32-vorc-wasmsimd.c src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c16.c src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c8.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index 05ed2b08c81..0bfec2769cd 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -566,6 +566,8 @@ ALL_AVX2_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c", "src/s32-vmul/gen/s32-vmul-avx2.c", "src/s32-vmul/gen/s32-vmulc-avx2.c", + "src/s32-vor/gen/s32-vor-avx2.c", + "src/s32-vor/gen/s32-vorc-avx2.c", "src/x8-lut/gen/x8-lut-avx2-u32.c", "src/x8-lut/gen/x8-lut-avx2-u64.c", 
"src/x8-lut/gen/x8-lut-avx2-u96.c", diff --git a/gen/avx512f_microkernels.bzl b/gen/avx512f_microkernels.bzl index 407c7a31eed..af40f9844ab 100644 --- a/gen/avx512f_microkernels.bzl +++ b/gen/avx512f_microkernels.bzl @@ -307,6 +307,8 @@ ALL_AVX512F_MICROKERNEL_SRCS = [ "src/math/f32-sqrt-avx512f-nr2fma.c", "src/s32-vmul/gen/s32-vmul-avx512f.c", "src/s32-vmul/gen/s32-vmulc-avx512f.c", + "src/s32-vor/gen/s32-vor-avx512f.c", + "src/s32-vor/gen/s32-vorc-avx512f.c", "src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4-prfm.c", "src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4.c", ] diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index 4c61e0611b6..fce1853765d 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -890,6 +890,8 @@ ALL_NEON_MICROKERNEL_SRCS = [ "src/s16-window/gen/s16-window-shift15-neon-u32.c", "src/s32-vmul/gen/s32-vmul-neon.c", "src/s32-vmul/gen/s32-vmulc-neon.c", + "src/s32-vor/gen/s32-vor-neon.c", + "src/s32-vor/gen/s32-vorc-neon.c", "src/u8-ibilinear/gen/u8-ibilinear-neon-c8.c", "src/u8-ibilinear/gen/u8-ibilinear-neon-c16.c", "src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index 8e97dde5c94..2675f6e2aa9 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -1048,6 +1048,8 @@ ALL_SCALAR_MICROKERNEL_SRCS = [ "src/s16-window/gen/s16-window-scalar-u4.c", "src/s32-vmul/gen/s32-vmul-scalar.c", "src/s32-vmul/gen/s32-vmulc-scalar.c", + "src/s32-vor/gen/s32-vor-scalar.c", + "src/s32-vor/gen/s32-vorc-scalar.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c", diff --git a/gen/sse41_microkernels.bzl b/gen/sse41_microkernels.bzl index 17251c8430e..8261ca439e7 100644 --- a/gen/sse41_microkernels.bzl +++ b/gen/sse41_microkernels.bzl @@ -377,6 +377,8 @@ ALL_SSE41_MICROKERNEL_SRCS = [ "src/s8-vclamp/s8-vclamp-sse41-u64.c", "src/s32-vmul/gen/s32-vmul-sse41.c", "src/s32-vmul/gen/s32-vmulc-sse41.c", + "src/s32-vor/gen/s32-vor-sse41.c", + "src/s32-vor/gen/s32-vorc-sse41.c", "src/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c", "src/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c", ] diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index 061d7d6d818..a80b74b1a36 100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -1186,6 +1186,8 @@ ALL_WASMSIMD_MICROKERNEL_SRCS = [ "src/s8-vclamp/s8-vclamp-wasmsimd-u64.c", "src/s32-vmul/gen/s32-vmul-wasmsimd.c", "src/s32-vmul/gen/s32-vmulc-wasmsimd.c", + "src/s32-vor/gen/s32-vor-wasmsimd.c", + "src/s32-vor/gen/s32-vorc-wasmsimd.c", "src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c", "src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c16.c", "src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c8.c", diff --git a/include/xnnpack.h b/include/xnnpack.h index c3b81bf619d..913db41c1b2 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -1789,6 +1789,22 @@ enum xnn_status xnn_define_negate( uint32_t output_id, uint32_t flags); +/// Define a Bitwsie OR Node and add it to a Subgraph. +/// +/// The OR node peforms bitwise OR between first and second input. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be defined in the @a subgraph. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be defined in the @a subgraph. 
+/// @param output_id - Value ID for the output tensor. +/// @param flags - binary features of the OR Node. No supported flags are currently defined. +enum xnn_status xnn_define_or( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + /// Define a Sigmoid Node and add it to a Subgraph. /// /// @param subgraph - a Subgraph object that will own the created Node. @@ -5437,6 +5453,24 @@ enum xnn_status xnn_run_negate_nc_f32( uint32_t flags, pthreadpool_t threadpool); +enum xnn_status xnn_create_or_nd_s32( + uint32_t flags, + xnn_operator_t* or_op_out); + +enum xnn_status xnn_reshape_or_nd_s32( + xnn_operator_t or_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_or_nd_s32( + xnn_operator_t or_op, + const int32_t* input1, + const int32_t* input2, + int32_t* output); + enum xnn_status xnn_create_prelu_nc_f16( size_t input_channels, size_t slope_channels, diff --git a/scripts/generate-s32-vor.sh b/scripts/generate-s32-vor.sh new file mode 100755 index 00000000000..583edcc7c0c --- /dev/null +++ b/scripts/generate-s32-vor.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +##################################### SIMD VOR ##################################### +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=scalar -D BATCH_TILES=1,2,4,8 -o src/s32-vor/gen/s32-vor-scalar.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=sse41 -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vor-sse41.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=wasmsimd -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vor-wasmsimd.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=neon -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vor-neon.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=avx2 -D BATCH_TILES=8,16,24,32 -o src/s32-vor/gen/s32-vor-avx2.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/s32-vor/gen/s32-vor-avx512f.c & + +##################################### SIMD VORC ##################################### +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=scalar -D BATCH_TILES=1,2,4,8 -o src/s32-vor/gen/s32-vorc-scalar.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=sse41 -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vorc-sse41.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=wasmsimd -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vorc-wasmsimd.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=neon -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vorc-neon.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=avx2 -D BATCH_TILES=8,16,24,32 -o src/s32-vor/gen/s32-vorc-avx2.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/s32-vor/gen/s32-vorc-avx512f.c & + +wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index 7af7fcdb3c8..b1ea04f9134 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -194,6 +194,9 @@ tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/qu tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --spec test/s32-vmul.yaml --output test/s32-vmul.cc & tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/s32-vmulc.yaml --output test/s32-vmulc.cc & +tools/generate-vbinary-test.py --tester 
VBinaryMicrokernelTester --spec test/s32-vor.yaml --output test/s32-vor.cc & +tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/s32-vorc.yaml --output test/s32-vorc.cc & + ### Tests for VUnary micro-kernels tools/generate-vunary-test.py --spec test/bf16-vabs.yaml --output test/bf16-vabs.cc & diff --git a/src/amalgam/gen/avx2.c b/src/amalgam/gen/avx2.c index 8cbc6e0e7bd..b1292da20cd 100644 --- a/src/amalgam/gen/avx2.c +++ b/src/amalgam/gen/avx2.c @@ -15832,3 +15832,103 @@ void xnn_s32_vmulc_ukernel__avx2_u16( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__avx2_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + 
xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/avx512f.c b/src/amalgam/gen/avx512f.c index 986eb3b61a7..802f8bd08eb 100644 --- a/src/amalgam/gen/avx512f.c +++ b/src/amalgam/gen/avx512f.c @@ -2231,8 +2231,6 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c64( } for (int i = 0; i < channels >> 4; ++i) { vacc[i] = _mm512_add_ps(vo[i], vacc[i]); - vacc[i] = _mm512_max_ps(vacc[i], vmin); - vacc[i] = _mm512_min_ps(vacc[i], vmax); } for (int i = 0; i < channels >> 4; ++i) { _mm512_storeu_ps(output, vacc[i]); output += 16; @@ -2240,8 +2238,6 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c64( if (remainder) { const size_t pos = num_full_chunks; __m512 vout = vacc[pos]; - vout = _mm512_max_ps(vout, vmin); - vout = _mm512_min_ps(vout, vmax); vout = _mm512_maskz_add_ps(vmask, vout, _mm512_maskz_loadu_ps(vmask, output)); _mm512_mask_storeu_ps(output, vmask, vout); } @@ -5557,3 +5553,103 @@ void xnn_s32_vmulc_ukernel__avx512f_u32( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__avx512f_u32( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 32; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u32( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for 
(; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/neon.c b/src/amalgam/gen/neon.c index 494a9b769fe..b551daa3e62 100644 --- a/src/amalgam/gen/neon.c +++ b/src/amalgam/gen/neon.c @@ -30606,3 +30606,103 @@ void xnn_s32_vmulc_ukernel__neon_u8( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__neon_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if 
XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/scalar.c b/src/amalgam/gen/scalar.c index a1e98ae3fb9..cd828afa941 100644 --- a/src/amalgam/gen/scalar.c +++ b/src/amalgam/gen/scalar.c @@ -6,9 +6,9 @@ // Auto-generated file. Do not edit! // Generator: tools/update-microkernels.py -a -#include #include #include +#include #include #include #include @@ -33351,3 +33351,87 @@ void xnn_s32_vmulc_ukernel__scalar_u2( output += xnn_simd_size_s32; } } + +void xnn_s32_vor_ukernel__scalar_u2( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 2; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u2( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} diff --git a/src/amalgam/gen/sse41.c b/src/amalgam/gen/sse41.c index fa0ad329bb2..85888c4b2b2 100644 --- a/src/amalgam/gen/sse41.c +++ b/src/amalgam/gen/sse41.c @@ -11478,3 +11478,103 @@ void xnn_s32_vmulc_ukernel__sse41_u8( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__sse41_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params 
params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/wasmsimd.c b/src/amalgam/gen/wasmsimd.c index cb9daa3efe9..e0902309215 100644 --- a/src/amalgam/gen/wasmsimd.c +++ b/src/amalgam/gen/wasmsimd.c @@ -40917,3 +40917,117 @@ void xnn_s32_vmulc_ukernel__wasmsimd_u16( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * 
sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/configs/binary-elementwise-config.c 
b/src/configs/binary-elementwise-config.c index d695fdce271..ab768d8668e 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -32,6 +32,7 @@ static struct xnn_binary_elementwise_config f32_vsqrdiff_config = {0}; static struct xnn_binary_elementwise_config s32_vmul_config = {0}; +static struct xnn_binary_elementwise_config s32_vor_config = {0}; static struct xnn_binary_elementwise_config qs8_vadd_config = {0}; static struct xnn_binary_elementwise_config qs8_vmul_config = {0}; @@ -55,6 +56,7 @@ XNN_INIT_ONCE_GUARD(f32_vmul); XNN_INIT_ONCE_GUARD(f32_vsub); XNN_INIT_ONCE_GUARD(f32_vsqrdiff); XNN_INIT_ONCE_GUARD(s32_vmul); +XNN_INIT_ONCE_GUARD(s32_vor); XNN_INIT_ONCE_GUARD(qs8_vadd); XNN_INIT_ONCE_GUARD(qs8_vmul); XNN_INIT_ONCE_GUARD(qu8_vadd); @@ -556,6 +558,61 @@ static void init_s32_vmul_config(void) { #endif } +static void init_s32_vor_config(void) { + #if XNN_ARCH_ARM + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_arm_neon) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__neon_u8; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.element_tile = 8; + } + else if (!XNN_PLATFORM_MOBILE) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__scalar_u2; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.element_tile = 2; + } + #elif XNN_ARCH_ARM64 + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__neon_u8; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.element_tile = 8; + #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__avx512f_u32; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx512f_u32; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx512f_u32; + s32_vor_config.linear.element_tile = 32; + } + else if (hardware_config->use_x86_avx2) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__avx2_u16; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx2_u16; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx2_u16; + s32_vor_config.linear.element_tile = 16; + } + else { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__sse41_u8; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__sse41_u8; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__sse41_u8; + s32_vor_config.linear.element_tile = 8; + } + #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__wasmsimd_u16; 
+ s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__wasmsimd_u16; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__wasmsimd_u16; + s32_vor_config.linear.element_tile = 16; + #else + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__scalar_u2; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.element_tile = 2; + #endif +} + static void init_f32_vdiv_config(void) { #if XNN_ARCH_ARM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1400,6 +1457,15 @@ const struct xnn_binary_elementwise_config* xnn_init_s32_vmul_config() { return &s32_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_s32_vor_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(s32_vor); + return &s32_vor_config; +} + const struct xnn_binary_elementwise_config* xnn_init_f32_vdiv_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { diff --git a/src/enums/node-type.c b/src/enums/node-type.c index 0db7e426904..0593d1201d1 100644 --- a/src/enums/node-type.c +++ b/src/enums/node-type.c @@ -13,10 +13,10 @@ #include "xnnpack/node-type.h" #if XNN_LOG_LEVEL > 0 -static const uint16_t offset[62] = { +static const uint16_t offset[63] = { 0, 8, 12, 17, 35, 54, 71, 93, 101, 107, 120, 133, 146, 159, 167, 182, 187, 197, 214, 232, 257, 264, 268, 272, 284, - 296, 308, 314, 330, 353, 358, 384, 410, 432, 454, 464, 468, 479, 494, 503, 512, 522, 529, 535, 558, 569, 574, 603, - 611, 619, 637, 644, 656, 675, 695, 707, 722, 748, 761, 778, 787, 792 + 296, 308, 314, 330, 353, 358, 384, 410, 432, 454, 464, 468, 479, 494, 503, 512, 522, 529, 532, 538, 561, 572, 577, + 606, 614, 622, 640, 647, 659, 678, 698, 710, 725, 751, 764, 781, 790, 795 }; static const char data[] = @@ -62,6 +62,7 @@ static const char data[] = "Minimum2\0" "Multiply2\0" "Negate\0" + "OR\0" "PReLU\0" "Reciprocal Square Root\0" "Reshape 2D\0" diff --git a/src/enums/node-type.yaml b/src/enums/node-type.yaml index 7c825e1e28f..06a9979d297 100644 --- a/src/enums/node-type.yaml +++ b/src/enums/node-type.yaml @@ -89,6 +89,8 @@ string: "Multiply2" - name: xnn_node_type_negate string: "Negate" +- name: xnn_node_type_or + string: "OR" - name: xnn_node_type_prelu string: "PReLU" - name: xnn_node_type_reciprocal_square_root diff --git a/src/enums/operator-type.c b/src/enums/operator-type.c index a01bfeb4558..53a64f488d0 100644 --- a/src/enums/operator-type.c +++ b/src/enums/operator-type.c @@ -12,16 +12,16 @@ #include "xnnpack/operator-type.h" -static const uint16_t offset[169] = { +static const uint16_t offset[170] = { 0, 8, 22, 36, 50, 64, 78, 92, 119, 147, 175, 203, 230, 257, 289, 321, 364, 382, 400, 425, 451, 467, 483, 498, 513, 535, 558, 581, 604, 627, 650, 673, 696, 719, 742, 760, 783, 806, 830, 848, 871, 895, 919, 943, 967, 1002, 1037, 1061, 1085, 1109, 1123, 1138, 1153, 1173, 1199, 1225, 1262, 1288, 1318, 1344, 1376, 1408, 1434, 1461, 1488, 1505, 1522, 1556, 1590, 1604, 1618, 1632, 1646, 1662, 1678, 1704, 1730, 1762, 1794, 1831, 1868, 1905, 1942, 1979, 2016, 2053, 2079, 2111, 2137, 2152, 2186, 2220, 2254, 2288, 2322, 2356, 2386, 2416, 2436, 2456, 2477, 2498, 2519, 2540, 2554, - 2578, 2602, 2625, 
2648, 2666, 2684, 2699, 2714, 2732, 2750, 2769, 2788, 2807, 2826, 2845, 2862, 2879, 2895, 2911, - 2944, 2977, 3005, 3033, 3061, 3089, 3116, 3143, 3160, 3177, 3218, 3259, 3277, 3295, 3313, 3331, 3346, 3362, 3378, - 3396, 3414, 3432, 3458, 3485, 3512, 3529, 3546, 3568, 3590, 3619, 3648, 3667, 3686, 3705, 3724, 3739, 3754, 3769, - 3784, 3803, 3823, 3843, 3863, 3884, 3905 + 2578, 2602, 2625, 2648, 2666, 2684, 2699, 2714, 2732, 2750, 2769, 2788, 2807, 2826, 2845, 2862, 2879, 2892, 2908, + 2924, 2957, 2990, 3018, 3046, 3074, 3102, 3129, 3156, 3173, 3190, 3231, 3272, 3290, 3308, 3326, 3344, 3359, 3375, + 3391, 3409, 3427, 3445, 3471, 3498, 3525, 3542, 3559, 3581, 3603, 3632, 3661, 3680, 3699, 3718, 3737, 3752, 3767, + 3782, 3797, 3816, 3836, 3856, 3876, 3897, 3918 }; static const char data[] = @@ -146,6 +146,7 @@ static const char data[] = "Multiply (ND, S32)\0" "Negate (NC, F16)\0" "Negate (NC, F32)\0" + "OR (ND, S32)\0" "PReLU (NC, F16)\0" "PReLU (NC, F32)\0" "Reciprocal Square Root (NC, F16)\0" diff --git a/src/enums/operator-type.yaml b/src/enums/operator-type.yaml index 7b153359692..b4230132720 100644 --- a/src/enums/operator-type.yaml +++ b/src/enums/operator-type.yaml @@ -247,6 +247,8 @@ string: "Negate (NC, F16)" - name: xnn_operator_type_negate_nc_f32 string: "Negate (NC, F32)" +- name: xnn_operator_type_or_nd_s32 + string: "OR (ND, S32)" - name: xnn_operator_type_prelu_nc_f16 string: "PReLU (NC, F16)" - name: xnn_operator_type_prelu_nc_f32 diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index 59261d83776..4a43b38d96e 100644 --- a/src/operators/binary-elementwise-nd.c +++ b/src/operators/binary-elementwise-nd.c @@ -782,6 +782,28 @@ enum xnn_status xnn_create_multiply_nd_s32( multiply_op_out); } +enum xnn_status xnn_create_or_nd_s32( + uint32_t flags, + xnn_operator_t* or_op_out) +{ + const struct xnn_binary_elementwise_config* s32_or_config = xnn_init_s32_vor_config(); + if (s32_or_config == NULL) { + xnn_log_error("failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_or_nd_s32)); + return xnn_status_unsupported_hardware; + } + + union xnn_s32_default_params params; + + return create_binary_elementwise_nd( + flags, + ¶ms, + ¶ms, + sizeof(params), + xnn_operator_type_or_nd_s32, + &s32_or_config->linear, + or_op_out); +} enum xnn_status xnn_create_subtract_nd_f16( float output_min, @@ -1517,6 +1539,26 @@ enum xnn_status xnn_reshape_multiply_nd_s32( threadpool); } + +enum xnn_status xnn_reshape_or_nd_s32( + xnn_operator_t or_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool) +{ + + return reshape_binary_elementwise_nd( + or_op, xnn_operator_type_or_nd_s32, + num_input1_dims, input1_shape, + num_input2_dims, input2_shape, + /*log2_element_size=*/XNN_LOG2_SIZEOF_INT32_T, + &or_op->params.s32_default, sizeof(or_op->params.s32_default), + &or_op->params.s32_default, sizeof(or_op->params.s32_default), + threadpool); +} + enum xnn_status xnn_reshape_subtract_nd_f16( xnn_operator_t subtract_op, size_t num_input1_dims, @@ -1836,6 +1878,17 @@ enum xnn_status xnn_setup_multiply_nd_s32( input1, input2, output); } +enum xnn_status xnn_setup_or_nd_s32( + xnn_operator_t or_op, + const int32_t* input1, + const int32_t* input2, + int32_t* output) +{ + return setup_binary_elementwise_nd( + or_op, xnn_operator_type_or_nd_s32, + input1, input2, output); +} + enum xnn_status xnn_setup_subtract_nd_f32( xnn_operator_t 
subtract_op, diff --git a/src/s32-vor/gen/s32-vor-avx2.c b/src/s32-vor/gen/s32-vor-avx2.c new file mode 100644 index 00000000000..4b053298274 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-avx2.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-avx2.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__avx2_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx2_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx2_u24( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + 
assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 24 * sizeof(int32_t); batch -= 24 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 24; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 24; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 24; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx2_u32( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 32; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t 
vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-avx512f.c b/src/s32-vor/gen/s32-vor-avx512f.c new file mode 100644 index 00000000000..0a49aa6a53e --- /dev/null +++ b/src/s32-vor/gen/s32-vor-avx512f.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-avx512f.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__avx512f_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx512f_u32( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 32; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, 
vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx512f_u48( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 48 * sizeof(int32_t); batch -= 48 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 48; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 48; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 48; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx512f_u64( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 64 * sizeof(int32_t); batch -= 64 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 64; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 64; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 64; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = 
xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-neon.c b/src/s32-vor/gen/s32-vor-neon.c new file mode 100644 index 00000000000..3e1346fd599 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-neon.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-neon.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__neon_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__neon_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output 
+= xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__neon_u12( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 12; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__neon_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + 
xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-scalar.c b/src/s32-vor/gen/s32-vor-scalar.c new file mode 100644 index 00000000000..090c3020c77 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-scalar.c @@ -0,0 +1,211 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-scalar.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__scalar_u1( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vor_ukernel__scalar_u2( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 2; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void 
xnn_s32_vor_ukernel__scalar_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 4 * sizeof(int32_t); batch -= 4 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 4; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 4; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 4; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vor_ukernel__scalar_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_4 = xnn_loadu_s32(input_a + 4 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_5 = xnn_loadu_s32(input_a + 5 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_6 = xnn_loadu_s32(input_a + 6 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_7 = xnn_loadu_s32(input_a + 7 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_4 = (xnn_loadu_s32(input_b + 4 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_5 = (xnn_loadu_s32(input_b + 5 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_6 = (xnn_loadu_s32(input_b + 6 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_7 = (xnn_loadu_s32(input_b + 7 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); 
+ xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + xnn_simd_s32_t vy_4 = xnn_or_s32(vin1_4, vin2_4); + xnn_simd_s32_t vy_5 = xnn_or_s32(vin1_5, vin2_5); + xnn_simd_s32_t vy_6 = xnn_or_s32(vin1_6, vin2_6); + xnn_simd_s32_t vy_7 = xnn_or_s32(vin1_7, vin2_7); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + xnn_storeu_s32(output + 4 * xnn_simd_size_s32, vy_4); + xnn_storeu_s32(output + 5 * xnn_simd_size_s32, vy_5); + xnn_storeu_s32(output + 6 * xnn_simd_size_s32, vy_6); + xnn_storeu_s32(output + 7 * xnn_simd_size_s32, vy_7); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} diff --git a/src/s32-vor/gen/s32-vor-sse41.c b/src/s32-vor/gen/s32-vor-sse41.c new file mode 100644 index 00000000000..937b332f01a --- /dev/null +++ b/src/s32-vor/gen/s32-vor-sse41.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-sse41.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__sse41_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__sse41_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + 
xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__sse41_u12( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 12; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__sse41_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = 
xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-wasmsimd.c b/src/s32-vor/gen/s32-vor-wasmsimd.c new file mode 100644 index 00000000000..87b7fab3cb7 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-wasmsimd.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
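+//
+// Elementwise bitwise OR of two int32 inputs. As in the other per-architecture
+// copies of this template, the wider variants run an unrolled multi-register
+// loop first, then a one-register-per-iteration loop, and finish with a masked
+// xnn_load_tail_s32/xnn_store_tail_s32 pass for the last few elements. The
+// `batch` argument is a byte count and must be a non-zero multiple of
+// sizeof(int32_t).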
+ +#include +#include +#include + +#include "xnnpack/simd/s32-wasmsimd.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__wasmsimd_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__wasmsimd_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__wasmsimd_u12( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 12; + + 
xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-avx2.c b/src/s32-vor/gen/s32-vorc-avx2.c new file mode 100644 index 00000000000..6919ee0ee1f --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-avx2.c 
@@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-avx2.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__avx2_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u24( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 24 * sizeof(int32_t); batch -= 24 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 24; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = 
xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 24; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u32( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-avx512f.c b/src/s32-vor/gen/s32-vorc-avx512f.c new file mode 100644 index 00000000000..ecf4a2863ed --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-avx512f.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
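+//
+// "vorc" is the OR-with-constant variant: input2 points at a single int32 that
+// is broadcast once with xnn_set1_s32, and every element of input1 is ORed
+// against that splatted value.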
+ +#include +#include +#include + +#include "xnnpack/simd/s32-avx512f.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__avx512f_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u32( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u48( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 48 * sizeof(int32_t); batch -= 48 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 48; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 48; + } + for (; batch >= xnn_simd_bytes_s32; 
batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u64( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 64 * sizeof(int32_t); batch -= 64 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 64; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 64; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-neon.c b/src/s32-vor/gen/s32-vorc-neon.c new file mode 100644 index 00000000000..6729f581d9f --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-neon.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
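+//
+// The u4/u8/u12/u16 suffixes give the unroll factor in int32 elements; with
+// 4-lane NEON vectors these correspond to 1, 2, 3 and 4 registers per
+// iteration of the unrolled loop. Batches that are not a multiple of the
+// unroll factor fall through to the single-register loop and the masked tail.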
+ +#include +#include +#include + +#include "xnnpack/simd/s32-neon.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__neon_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u12( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= 
xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-scalar.c b/src/s32-vor/gen/s32-vorc-scalar.c new file mode 100644 index 00000000000..e60eadd40e3 --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-scalar.c @@ -0,0 +1,190 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
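+//
+// In the scalar build xnn_simd_size_s32 is 1, so every element is handled by
+// the "full register" loops and no masked tail block is needed.
+//
+// Illustrative call (hypothetical buffers and values, not part of the
+// generated code):
+//   int32_t a[5] = {1, 2, 4, 8, 16};
+//   int32_t b = 0x100;
+//   int32_t y[5];
+//   union xnn_s32_default_params params;  // never read by these kernels
+//   xnn_s32_vorc_ukernel__scalar_u1(5 * sizeof(int32_t), a, &b, y, &params);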
+ +#include +#include +#include + +#include "xnnpack/simd/s32-scalar.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__scalar_u1( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u2( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 4 * sizeof(int32_t); batch -= 4 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 4; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 4; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u8( + size_t batch, + const int32_t* input1, + const 
int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_4 = (xnn_loadu_s32(input1 + 4 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_5 = (xnn_loadu_s32(input1 + 5 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_6 = (xnn_loadu_s32(input1 + 6 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_7 = (xnn_loadu_s32(input1 + 7 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + xnn_simd_s32_t vy_4 = xnn_or_s32(vin1_4, vin2); + xnn_simd_s32_t vy_5 = xnn_or_s32(vin1_5, vin2); + xnn_simd_s32_t vy_6 = xnn_or_s32(vin1_6, vin2); + xnn_simd_s32_t vy_7 = xnn_or_s32(vin1_7, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + xnn_storeu_s32(output + 4 * xnn_simd_size_s32, vy_4); + xnn_storeu_s32(output + 5 * xnn_simd_size_s32, vy_5); + xnn_storeu_s32(output + 6 * xnn_simd_size_s32, vy_6); + xnn_storeu_s32(output + 7 * xnn_simd_size_s32, vy_7); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} diff --git a/src/s32-vor/gen/s32-vorc-sse41.c b/src/s32-vor/gen/s32-vorc-sse41.c new file mode 100644 index 00000000000..38013a5dbdb --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-sse41.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
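+//
+// The tail helpers take the remaining element count, computed from the byte
+// count as `batch >> XNN_LOG2_SIZEOF_INT32_T` (i.e. batch / sizeof(int32_t));
+// how the partial vector is actually loaded and stored is defined by
+// xnn_load_tail_s32/xnn_store_tail_s32 in the per-architecture s32 SIMD
+// header, not in this file.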
+ +#include +#include +#include + +#include "xnnpack/simd/s32-sse41.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__sse41_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u12( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= 
xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-wasmsimd.c b/src/s32-vor/gen/s32-vorc-wasmsimd.c new file mode 100644 index 00000000000..5707f68328d --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-wasmsimd.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
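The WasmSIMD file below is generated from the same s32-vorc template as the SSE4.1 file above; only the 4-lane load/store/OR primitives differ. In every unrolled variant the batch is consumed in three stages: full unrolled iterations, single-vector iterations, and a masked tail handled with xnn_load_tail_s32/xnn_store_tail_s32. A small sketch of that split (an illustrative helper, not part of the change):

#include <stddef.h>
#include <stdint.h>

// How an unrolled kernel such as ..._u16 with 4-lane vectors partitions a batch:
// `full_iters` iterations of the unrolled loop (unroll_elems elements each), then
// `single_iters` single-vector iterations (simd_elems each), then a partial tail of
// fewer than simd_elems elements.
static void split_batch(size_t batch_bytes, size_t unroll_elems, size_t simd_elems,
                        size_t* full_iters, size_t* single_iters, size_t* tail_elems) {
  size_t elems = batch_bytes / sizeof(int32_t);
  *full_iters = elems / unroll_elems;
  elems -= *full_iters * unroll_elems;
  *single_iters = elems / simd_elems;
  *tail_elems = elems - *single_iters * simd_elems;  // load_tail/store_tail path
}

For example, a batch of 23 int32 values (92 bytes) processed by the 16-element unroll with 4-lane vectors splits into one unrolled iteration, one single-vector iteration, and a 3-element tail.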
+ +#include +#include +#include + +#include "xnnpack/simd/s32-wasmsimd.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__wasmsimd_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u12( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch 
-= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/s32-vor.c.in b/src/s32-vor/s32-vor.c.in new file mode 100644 index 00000000000..53642151ce2 --- /dev/null +++ b/src/s32-vor/s32-vor.c.in @@ -0,0 +1,80 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
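The template below (s32-vor.c.in) generates the two-tensor variant, where both operands are full arrays rather than one broadcast constant. An illustrative scalar reference of what the generated kernels compute:

#include <stddef.h>
#include <stdint.h>

// Reference semantics of the two-input s32 "vor" kernels generated from the template
// below: output[i] = input_a[i] | input_b[i], with `batch` again given in bytes.
static void s32_vor_reference(size_t batch, const int32_t* input_a,
                              const int32_t* input_b, int32_t* output) {
  for (size_t i = 0; i < batch / sizeof(int32_t); i++) {
    output[i] = input_a[i] | input_b[i];
  }
}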
+ +$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" +$BATCH_TILES = tuple(int(bt) for bt in BATCH_TILES.split(",")) +$SIMD_SIZE = BATCH_TILES[0] +#include +#include +#include + +#include "xnnpack/simd/s32-${ARCH}.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + +$for BATCH_TILE in BATCH_TILES: + $assert BATCH_TILE % SIMD_SIZE == 0 + $assert BATCH_TILE >= SIMD_SIZE + $SIMD_TILE = BATCH_TILE // SIMD_SIZE + + void xnn_s32_vor_ukernel__${ARCH}_u${BATCH_TILE}( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) + { + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == ${SIMD_SIZE}); + + $if SIMD_TILE > 1: + for (; batch >= ${BATCH_TILE} * sizeof(int32_t); batch -= ${BATCH_TILE} * sizeof(int32_t)) { + xnn_simd_s32_t vin1_${ABC[0]} = xnn_loadu_s32(input_a); + $for N in range(1, SIMD_TILE): + xnn_simd_s32_t vin1_${ABC[N]} = xnn_loadu_s32(input_a + ${N} * xnn_simd_size_s32); + input_a += ${BATCH_TILE}; + + xnn_simd_s32_t vin2_${ABC[0]} = xnn_loadu_s32(input_b); + $for N in range(1, SIMD_TILE): + xnn_simd_s32_t vin2_${ABC[N]} = (xnn_loadu_s32(input_b + ${N} * xnn_simd_size_s32)); + input_b += ${BATCH_TILE}; + + $for N in range(0, SIMD_TILE): + xnn_simd_s32_t vy_${ABC[N]} = xnn_or_s32(vin1_${ABC[N]}, vin2_${ABC[N]}); + + xnn_storeu_s32(output, vy_${ABC[0]}); + $for N in range(1, SIMD_TILE): + xnn_storeu_s32(output + ${N} * xnn_simd_size_s32, vy_${ABC[N]}); + output += ${BATCH_TILE}; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + $if SIMD_SIZE > 1: + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } + } diff --git a/src/s32-vor/s32-vorc.c.in b/src/s32-vor/s32-vorc.c.in new file mode 100644 index 00000000000..fb433b92081 --- /dev/null +++ b/src/s32-vor/s32-vorc.c.in @@ -0,0 +1,73 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
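Like s32-vor.c.in above, the s32-vorc.c.in template that follows is expanded by tools/xngen once per architecture: BATCH_TILES is a comma-separated list of unroll widths (judging from the generated files, "4,8,12,16" for the 4-lane SSE4.1/NEON/WasmSIMD builds and "1,2,4,8" for scalar), SIMD_SIZE is its first entry, and SIMD_TILE = BATCH_TILE // SIMD_SIZE sets how many vectors the main loop handles per iteration. The $if SIMD_TILE > 1 and $if SIMD_SIZE > 1 guards omit the unrolled loop and the masked tail where they would be empty, so the smallest scalar variant should contain only the element-by-element loop.

// Expected expansion for ARCH=wasmsimd with BATCH_TILES="4,8,12,16" (matches the
// declarations added to src/xnnpack/vbinary.h later in this change):
//   xnn_s32_vorc_ukernel__wasmsimd_u4, _u8, _u12, _u16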
+ +$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" +$BATCH_TILES = tuple(int(bt) for bt in BATCH_TILES.split(",")) +$SIMD_SIZE = BATCH_TILES[0] +#include +#include +#include + +#include "xnnpack/simd/s32-${ARCH}.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + +$for BATCH_TILE in BATCH_TILES: + $assert BATCH_TILE % SIMD_SIZE == 0 + $assert BATCH_TILE >= SIMD_SIZE + $SIMD_TILE = BATCH_TILE // SIMD_SIZE + + void xnn_s32_vorc_ukernel__${ARCH}_u${BATCH_TILE}( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) + { + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == ${SIMD_SIZE}); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + $if SIMD_TILE > 1: + for (; batch >= ${BATCH_TILE} * sizeof(int32_t); batch -= ${BATCH_TILE} * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_${ABC[0]} = (xnn_loadu_s32(input1)); + $for N in range(1, SIMD_TILE): + xnn_simd_s32_t vin1_${ABC[N]} = (xnn_loadu_s32(input1 + ${N} * xnn_simd_size_s32)); + input1 += ${BATCH_TILE}; + + $for N in range(0, SIMD_TILE): + xnn_simd_s32_t vy_${ABC[N]} = xnn_or_s32(vin1_${ABC[N]}, vin2); + + xnn_storeu_s32(output, vy_${ABC[0]}); + $for N in range(1, SIMD_TILE): + xnn_storeu_s32(output + ${N} * xnn_simd_size_s32, vy_${ABC[N]}); + output += ${BATCH_TILE}; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + $if SIMD_SIZE > 1: + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } + } diff --git a/src/subgraph/or.c b/src/subgraph/or.c new file mode 100644 index 00000000000..d16e3d8667b --- /dev/null +++ b/src/subgraph/or.c @@ -0,0 +1,253 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
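src/subgraph/or.c below wires the new operator into the subgraph API as xnn_node_type_or, with create/reshape/setup callbacks, and exposes xnn_define_or to users. A hedged usage sketch, assuming the subgraph and its int32 tensor values were already defined elsewhere (only xnn_define_or itself is introduced by this change, and the helper name is hypothetical):

#include <stdint.h>
#include "xnnpack.h"

// Illustrative only: `subgraph`, `a_id`, `b_id`, and `out_id` are assumed to come from
// xnn_create_subgraph / xnn_define_tensor_value calls with xnn_datatype_int32.
static enum xnn_status define_or_node(xnn_subgraph_t subgraph, uint32_t a_id,
                                      uint32_t b_id, uint32_t out_id) {
  // The two inputs follow the usual binary-elementwise broadcasting rules.
  return xnn_define_or(subgraph, a_id, b_id, out_id, /*flags=*/0);
}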
+ +#include +#include +#include +#include +#include + +#include "xnnpack.h" +#include "xnnpack/common.h" +#include "xnnpack/log.h" +#include "xnnpack/node-type.h" +#include "xnnpack/operator-type.h" +#include "xnnpack/operator.h" +#include "xnnpack/reshape-helpers.h" +#include "xnnpack/subgraph-validation.h" +#include "xnnpack/subgraph.h" +#include "pthreadpool.h" + +static enum xnn_status create_or_operator( + const struct xnn_node* node, + const struct xnn_value* values, + size_t num_values, + struct xnn_operator_data* opdata, + struct xnn_code_cache* code_cache, + xnn_weights_cache_t weights_cache) +{ + assert(node->num_inputs == 2); + assert(node->num_outputs == 1); + + enum xnn_status status; + switch (node->compute_type) { + case xnn_compute_type_s32: + status = xnn_create_or_nd_s32( + node->flags, + &opdata->operator_objects[0]); + break; + default: + XNN_UNREACHABLE; + } + return status; +} + +static enum xnn_status reshape_or_operator( + struct xnn_operator_data* opdata, + struct xnn_value* values, + size_t num_values, + pthreadpool_t threadpool) +{ + const uint32_t input1_id = opdata->inputs[0]; + assert(input1_id < num_values); + const uint32_t input2_id = opdata->inputs[1]; + assert(input2_id < num_values); + const uint32_t output_id = opdata->outputs[0]; + assert(output_id < num_values); + + opdata->shape1.num_dims = values[input1_id].shape.num_dims; + opdata->shape2.num_dims = values[input2_id].shape.num_dims; + if (values[output_id].layout == xnn_layout_type_nchw) { + assert(values[input1_id].layout == xnn_layout_type_nchw); + assert(values[input2_id].layout == xnn_layout_type_nchw); + opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; + opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; + if (values[input1_id].shape.num_dims > 2) { + memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); + } + opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; + opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; + if (values[input1_id].shape.num_dims > 2) { + memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); + } + } else { + assert(values[output_id].layout == xnn_layout_type_nhwc); + assert(values[input1_id].layout == xnn_layout_type_nhwc); + assert(values[input2_id].layout == xnn_layout_type_nhwc); + memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); + memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); + } + + // Handle scalars. Although the output shape is dimensionless, the reshape + // function must be passed a valid shape to prevent skipping the op. 
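  // (For example, two 0-dimensional int32 inputs are treated as 1-element vectors of
  // shape {1}, so xnn_reshape_or_nd_s32 still receives valid, non-empty shapes.)
  // Worked example of the NCHW branch above: a stored shape of {2, 3, 4, 5} becomes the
  // operator shape {2, 5, 3, 4}: the batch dimension stays first, the last (channel)
  // dimension moves to position 1, and the remaining inner dimensions are copied after it.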
+ if (opdata->shape1.num_dims == 0) { + opdata->shape1.num_dims = 1; + opdata->shape1.dim[0] = 1; + } + if (opdata->shape2.num_dims == 0) { + opdata->shape2.num_dims = 1; + opdata->shape2.dim[0] = 1; + } + const size_t old_workspace_size = opdata->workspace_size; + enum xnn_status status = xnn_status_invalid_state; + switch (opdata->operator_objects[0]->type) { + case xnn_operator_type_or_nd_s32: + status = xnn_reshape_or_nd_s32( + opdata->operator_objects[0], + opdata->shape1.num_dims, + opdata->shape1.dim, + opdata->shape2.num_dims, + opdata->shape2.dim, + threadpool); + break; + default: + XNN_UNREACHABLE; + } + if (status != xnn_status_success) { + return status; + } + return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); +} + +static enum xnn_status setup_or_operator( + const struct xnn_operator_data* opdata, + const struct xnn_value* values, + size_t num_values, + pthreadpool_t threadpool) +{ + const uint32_t input1_id = opdata->inputs[0]; + assert(input1_id != XNN_INVALID_VALUE_ID); + assert(input1_id < num_values); + + const uint32_t input2_id = opdata->inputs[1]; + assert(input2_id != XNN_INVALID_VALUE_ID); + assert(input2_id < num_values); + + const uint32_t output_id = opdata->outputs[0]; + assert(output_id != XNN_INVALID_VALUE_ID); + assert(output_id < num_values); + + const struct xnn_value* input1_value = values + input1_id; + const void* input1_data = input1_value->data; + assert(input1_data != NULL); + + const struct xnn_value* input2_value = values + input2_id; + const void* input2_data = input2_value->data; + assert(input2_data != NULL); + + const struct xnn_value* output_value = values + output_id; + void* output_data = output_value->data; + assert(output_data != NULL); + + switch (opdata->operator_objects[0]->type) { + case xnn_operator_type_or_nd_s32: + return xnn_setup_or_nd_s32( + opdata->operator_objects[0], + input1_data, input2_data, output_data); + default: + XNN_UNREACHABLE; + } +} + +enum xnn_status xnn_define_or( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags) +{ + enum xnn_status status; + if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_or)) != xnn_status_success) { + return status; + } + + if ((status = xnn_subgraph_check_nth_input_node_id(xnn_node_type_or, input1_id, subgraph->num_values, 1)) != + xnn_status_success) { + return status; + } + + const struct xnn_value* input1_value = &subgraph->values[input1_id]; + status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_or, input1_id, input1_value, 1); + if (status != xnn_status_success) { + return status; + } + + switch (input1_value->datatype) { + case xnn_datatype_int32: + break; + default: + xnn_log_error( + "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_or), input1_id, + xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); + return xnn_status_invalid_parameter; + } + + if ((status = xnn_subgraph_check_nth_input_node_id( + xnn_node_type_or, input2_id, subgraph->num_values, 2)) != xnn_status_success) { + return status; + } + + const struct xnn_value* input2_value = &subgraph->values[input2_id]; + status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_or, input2_id, input2_value, 2); + if (status != xnn_status_success) { + return status; + } + + switch (input2_value->datatype) { + case xnn_datatype_int32: + break; + default: + xnn_log_error( + 
"failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_or), input2_id, + xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); + return xnn_status_invalid_parameter; + } + + status = xnn_subgraph_check_output_node_id(xnn_node_type_or, output_id, subgraph->num_values); + if (status != xnn_status_success) { + return status; + } + + const struct xnn_value* output_value = &subgraph->values[output_id]; + status = xnn_subgraph_check_output_type_dense(xnn_node_type_or, output_id, output_value); + if (status != xnn_status_success) { + return status; + } + + enum xnn_compute_type compute_type = xnn_compute_type_invalid; + switch (output_value->datatype) { + case xnn_datatype_int32: + compute_type = xnn_compute_type_s32; + break; + default: + xnn_log_error( + "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_or), output_id, + xnn_datatype_to_string(output_value->datatype), output_value->datatype); + return xnn_status_invalid_parameter; + } + + struct xnn_node* node = xnn_subgraph_new_node(subgraph); + if (node == NULL) { + return xnn_status_out_of_memory; + } + + node->type = xnn_node_type_or; + node->compute_type = compute_type; + node->num_inputs = 2; + node->inputs[0] = input1_id; + node->inputs[1] = input2_id; + node->num_outputs = 1; + node->outputs[0] = output_id; + node->flags = flags; + + node->create = create_or_operator; + node->reshape = reshape_or_operator; + node->setup = setup_or_operator; + + return xnn_status_success; +} diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 3f1d2b599de..382e344b3e7 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -35,6 +35,7 @@ XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vsqrdiff_c XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vadd_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vcopysign_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_s32_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_s32_vor_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vdiv_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmax_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmin_config(); diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index d11ab8e6e60..83e0879185c 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1980,6 +1980,15 @@ typedef void (*xnn_f32_vneg_ukernel_fn)( float* output, const union xnn_f32_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +// VOR: Vector OR elementwise + +typedef void (*xnn_s32_vor_ukernel_fn)( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + // VRELU: Vector REctified Linear Unit elementwise typedef void (*xnn_f32_vrelu_ukernel_fn)( diff --git a/src/xnnpack/node-type.h b/src/xnnpack/node-type.h index a0100109e8e..6401efa09f3 100644 --- a/src/xnnpack/node-type.h +++ b/src/xnnpack/node-type.h @@ -59,6 +59,7 @@ enum xnn_node_type { xnn_node_type_minimum2, xnn_node_type_multiply2, xnn_node_type_negate, + xnn_node_type_or, xnn_node_type_prelu, xnn_node_type_reciprocal_square_root, xnn_node_type_reshape_2d, diff --git 
a/src/xnnpack/operator-type.h b/src/xnnpack/operator-type.h index c0ecfdb3200..2c2c0a32adf 100644 --- a/src/xnnpack/operator-type.h +++ b/src/xnnpack/operator-type.h @@ -138,6 +138,7 @@ enum xnn_operator_type { xnn_operator_type_multiply_nd_s32, xnn_operator_type_negate_nc_f16, xnn_operator_type_negate_nc_f32, + xnn_operator_type_or_nd_s32, xnn_operator_type_prelu_nc_f16, xnn_operator_type_prelu_nc_f32, xnn_operator_type_reciprocal_square_root_nc_f16, diff --git a/src/xnnpack/simd/s32-avx2.h b/src/xnnpack/simd/s32-avx2.h index 7a4fdd7ff2f..24d8567ac56 100644 --- a/src/xnnpack/simd/s32-avx2.h +++ b/src/xnnpack/simd/s32-avx2.h @@ -43,6 +43,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return _mm256_min_epi32(a, b); } +// Bitwise Operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return _mm256_or_si256(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { diff --git a/src/xnnpack/simd/s32-avx512f.h b/src/xnnpack/simd/s32-avx512f.h index ec89f56cc19..5bcbaa0bf24 100644 --- a/src/xnnpack/simd/s32-avx512f.h +++ b/src/xnnpack/simd/s32-avx512f.h @@ -42,22 +42,29 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return _mm512_min_epi32(a, b); } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return _mm512_or_epi32(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { - return _mm512_loadu_epi32(ptr); + return _mm512_loadu_si512(ptr); } static XNN_INLINE xnn_simd_s32_t xnn_load_s32(const int32_t* ptr) { - return _mm512_load_epi32(ptr); + return _mm512_load_si512(ptr); } static XNN_INLINE void xnn_storeu_s32(int32_t* ptr, xnn_simd_s32_t v) { - _mm512_storeu_epi32(ptr, v); + _mm512_storeu_si512(ptr, v); } static XNN_INLINE void xnn_store_s32(float* ptr, xnn_simd_s32_t v) { - _mm512_store_epi32(ptr, v); + _mm512_store_si512(ptr, v); } static XNN_INLINE xnn_simd_s32_t xnn_set1_s32(int32_t v) { diff --git a/src/xnnpack/simd/s32-neon.h b/src/xnnpack/simd/s32-neon.h index 62aa2b70f50..b4354c8b634 100644 --- a/src/xnnpack/simd/s32-neon.h +++ b/src/xnnpack/simd/s32-neon.h @@ -38,6 +38,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return vminq_s32(a, b); } +// Bitwsie operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return vorrq_s32(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { return vld1q_s32(ptr); diff --git a/src/xnnpack/simd/s32-scalar.h b/src/xnnpack/simd/s32-scalar.h index 55c1fc53d12..aa625135796 100644 --- a/src/xnnpack/simd/s32-scalar.h +++ b/src/xnnpack/simd/s32-scalar.h @@ -39,6 +39,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return (a < b) ? 
a : b; } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return (a | b); +} + static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t *ptr) { return *ptr; } static XNN_INLINE xnn_simd_s32_t xnn_load_s32(const int32_t *ptr) { return *ptr; } diff --git a/src/xnnpack/simd/s32-sse41.h b/src/xnnpack/simd/s32-sse41.h index c453f097506..ea03b638783 100644 --- a/src/xnnpack/simd/s32-sse41.h +++ b/src/xnnpack/simd/s32-sse41.h @@ -41,6 +41,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return _mm_min_epi32(a, b); } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return _mm_or_si128(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { diff --git a/src/xnnpack/simd/s32-wasmsimd.h b/src/xnnpack/simd/s32-wasmsimd.h index 96e2c836252..141d952487a 100644 --- a/src/xnnpack/simd/s32-wasmsimd.h +++ b/src/xnnpack/simd/s32-wasmsimd.h @@ -41,6 +41,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return wasm_i32x4_min(a, b); } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return wasm_v128_or(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h index 8b11dd26dc7..b8b0ce4d43f 100644 --- a/src/xnnpack/vbinary.h +++ b/src/xnnpack/vbinary.h @@ -1339,6 +1339,56 @@ DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vmulc_ukernel__wasmsimd_u8) DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vmulc_ukernel__wasmsimd_u12) DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vmulc_ukernel__wasmsimd_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u24) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u48) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u64) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u1) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u2) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u16) + +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u8) 
+DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u24) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u48) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u64) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u1) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u2) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u16) + #ifdef __cplusplus } // extern "C" #endif diff --git a/test/BUILD.bazel b/test/BUILD.bazel index c7d5d3afab6..f8e6a59b935 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -291,6 +291,7 @@ xnnpack_cc_library( "qu8_vmul_minmax_fp32", "qu8_vmul_minmax_rndnu", "s32_vmul", + "s32_vor", ]] [xnnpack_unit_test( @@ -339,6 +340,7 @@ xnnpack_cc_library( "qu8_vmulc_minmax_fp32", "qu8_vmulc_minmax_rndnu", "s32_vmulc", + "s32_vorc", ]] [xnnpack_unit_test( @@ -1584,6 +1586,7 @@ xnnpack_binary( "squared_difference_nd_eager", "subtract_nd", "subtract_nd_eager", + "or_nd" ]] xnnpack_unit_test( @@ -2092,6 +2095,7 @@ xnnpack_unit_test( "maximum2", "minimum2", "multiply2", + "or", "squared_difference", "subtract2", ]] diff --git a/test/binary-elementwise-operator-tester.cc b/test/binary-elementwise-operator-tester.cc index b19a07c781b..a2eb0a39e02 100644 --- a/test/binary-elementwise-operator-tester.cc +++ b/test/binary-elementwise-operator-tester.cc @@ -809,6 +809,10 @@ void BinaryElementwiseOperatorTester::TestS32() const { ASSERT_EQ(xnn_status_success, xnn_create_multiply_nd_s32(0, &binary_elementwise_op)); break; + case OperationType::OR: + ASSERT_EQ(xnn_status_success, + xnn_create_or_nd_s32(0, &binary_elementwise_op)); + break; default: FAIL() << "Unsupported operation type"; } @@ -830,6 +834,17 @@ void BinaryElementwiseOperatorTester::TestS32() const { binary_elementwise_op, input1.data(), input2.data(), output.data())); break; + case OperationType::OR: + ASSERT_EQ( + xnn_status_success, + xnn_reshape_or_nd_s32( + binary_elementwise_op, num_input1_dims(), input1_shape().data(), + num_input2_dims(), input2_shape().data(), + /*threadpool=*/nullptr)); + ASSERT_EQ(xnn_status_success, xnn_setup_or_nd_s32( + binary_elementwise_op, input1.data(), + input2.data(), output.data())); + break; default: FAIL() << "Unsupported operation type"; } diff --git a/test/binary-elementwise-operator-tester.h 
b/test/binary-elementwise-operator-tester.h index 6b46730e79b..df93710f4a3 100644 --- a/test/binary-elementwise-operator-tester.h +++ b/test/binary-elementwise-operator-tester.h @@ -30,6 +30,7 @@ class BinaryElementwiseOperatorTester { Maximum, Minimum, Multiply, + OR, Subtract, SquaredDifference, }; @@ -194,6 +195,8 @@ class BinaryElementwiseOperatorTester { return std::min(a, b); case OperationType::Multiply: return a * b; + case OperationType::OR: + return a | b; case OperationType::Subtract: return a - b; case OperationType::SquaredDifference: diff --git a/test/or-nd.cc b/test/or-nd.cc new file mode 100644 index 00000000000..bc8be75b35e --- /dev/null +++ b/test/or-nd.cc @@ -0,0 +1,1157 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "binary-elementwise-operator-tester.h" +#include + +constexpr size_t kDim1 = 2; +constexpr size_t kDim2 = 3; +constexpr size_t kDim3 = 4; +constexpr size_t kDim4 = 5; +constexpr size_t kDim5 = 6; +constexpr size_t kDim6 = 7; + + +TEST(OR_ND_S32, or_0d_x_0d) { + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .TestS32(); +} + +TEST(OR_ND_S32, or_1d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_0d_x_1d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_2d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_2d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_3d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 
1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_3d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 
1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_4d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_4d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_5d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 
1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 
1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_5d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_6d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_6d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 
1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} diff --git a/test/or.cc b/test/or.cc new file mode 100644 index 00000000000..94057777969 --- /dev/null +++ b/test/or.cc @@ -0,0 +1,138 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include "xnnpack.h" +#include "xnnpack/node-type.h" +#include "xnnpack/operator.h" +#include "xnnpack/subgraph.h" +#include "subgraph-binary-tester.h" + +using OrS32 = BinaryTest<int32_t>; + +TEST_F(OrS32, define) { + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); + std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph); + + std::vector<size_t> dims = RandomShape(); + + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, dims.size(), dims.data(), nullptr, + /*external_id=*/0, /*flags=*/0, &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t input2_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, dims.size(), dims.data(), nullptr, + /*external_id=*/0, /*flags=*/0, &input2_id)); + ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, + xnn_define_tensor_value( + subgraph, xnn_datatype_int32, dims.size(), dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ( + xnn_status_success, + xnn_define_or(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + const struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_node_type_or); + ASSERT_EQ(node->compute_type, xnn_compute_type_s32); + ASSERT_EQ(node->num_inputs, 2); + ASSERT_EQ(node->inputs[0], input1_id); + ASSERT_EQ(node->inputs[1], input2_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); +} + +TEST_F(OrS32, matches_operator_api) +{ + std::generate(input1.begin(), input1.end(), [&]() { return s32dist(rng); }); + std::generate(input2.begin(), input2.end(), [&]() { return s32dist(rng); }); + std::fill(operator_output.begin(), operator_output.end(), INT_MAX); + std::fill(subgraph_output.begin(), subgraph_output.end(), INT_MAX); + + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_operator_t op = nullptr; + + // Call 
operator API. + ASSERT_EQ(xnn_status_success, xnn_create_or_nd_s32(/*flags=*/0, &op)); + std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator); + + ASSERT_EQ( + xnn_status_success, xnn_reshape_or_nd_s32( + op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), + /*threadpool=*/nullptr)); + + ASSERT_EQ( + xnn_status_success, xnn_setup_or_nd_s32(op, input1.data(), input2.data(), operator_output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); + + // Call subgraph API. + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); + std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, input1_dims.size(), input1_dims.data(), nullptr, + /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t input2_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, input2_dims.size(), input2_dims.data(), nullptr, + /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); + ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, + xnn_define_tensor_value( + subgraph, xnn_datatype_int32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ( + xnn_status_success, + xnn_define_or(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime); + std::array<xnn_external_value, 3> external = { + xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, + xnn_external_value{output_id, subgraph_output.data()}}; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + ASSERT_EQ(subgraph_output, operator_output); +} diff --git a/test/s32-vor.cc b/test/s32-vor.cc new file mode 100644 index 00000000000..33533ba9619 --- /dev/null +++ b/test/s32-vor.cc @@ -0,0 +1,1568 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
+// Specification: test/s32-vor.yaml +// Generator: tools/generate-vbinary-test.py + + +#include +#include "xnnpack/common.h" +#include "xnnpack/isa-checks.h" +#include "xnnpack/microparams-init.h" +#include "xnnpack/vbinary.h" +#include "vbinary-microkernel-tester.h" + + +TEST(S32_VOR__SCALAR_U1, batch_eq_1) { + VBinaryMicrokernelTester() + .batch_size(1) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U1, batch_gt_1) { + for (size_t batch_size = 2; batch_size < 10; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U1, inplace_a) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U1, inplace_b) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U1, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + + +TEST(S32_VOR__SCALAR_U2, batch_eq_2) { + VBinaryMicrokernelTester() + .batch_size(2) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U2, batch_div_2) { + for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, batch_lt_2) { + for (size_t batch_size = 1; batch_size < 2; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, batch_gt_2) { + for (size_t batch_size = 3; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, inplace_a) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, inplace_b) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + + +TEST(S32_VOR__SCALAR_U4, batch_eq_4) { + VBinaryMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + 
VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, inplace_a) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, inplace_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + + +TEST(S32_VOR__SCALAR_U8, batch_eq_8) { + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, inplace_a) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, inplace_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U4, batch_eq_4) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U4, batch_div_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 8; batch_size < 40; 
batch_size += 4) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, batch_lt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, batch_gt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U8, batch_eq_8) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U8, batch_div_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, batch_lt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, batch_gt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + 
.Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U12, batch_eq_12) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U12, batch_div_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, batch_lt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, batch_gt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U16, batch_eq_16) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U16, batch_div_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, batch_lt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, batch_gt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + 
TEST(S32_VOR__SSE41_U16, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U8, batch_eq_8) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U8, batch_div_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, batch_lt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, batch_gt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 17; batch_size < 
32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U24, batch_eq_24) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(24) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U24, batch_div_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, batch_lt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, batch_gt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 25; batch_size < 48; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryMicrokernelTester() + 
.batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + 
.inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U48, batch_eq_48) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(48) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U48, batch_div_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 96; batch_size < 480; batch_size += 48) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, batch_lt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 48; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, batch_gt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 49; batch_size < 96; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + 
.Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U64, batch_eq_64) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(64) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U64, batch_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 128; batch_size < 640; batch_size += 64) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, batch_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 64; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, batch_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 65; batch_size < 128; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U4, batch_eq_4) { + VBinaryMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, 
VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, inplace_a) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, inplace_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U8, batch_eq_8) { + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, inplace_a) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, inplace_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U12, batch_eq_12) { + VBinaryMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U12, batch_div_12) { + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, 
VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, batch_lt_12) { + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, batch_gt_12) { + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, inplace_a) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, inplace_b) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U16, batch_eq_16) { + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U16, batch_div_16) { + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, batch_lt_16) { + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, batch_gt_16) { + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, inplace_a) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, inplace_b) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U4, batch_eq_4) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(4) + 
.Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U4, batch_div_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, batch_lt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, batch_gt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U8, batch_eq_8) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U8, batch_div_8) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, batch_lt_8) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, batch_gt_8) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t 
batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U12, batch_eq_12) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U12, batch_div_12) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, batch_lt_12) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, batch_gt_12) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U16, batch_eq_16) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U16, batch_div_16) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, batch_lt_16) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, batch_gt_16) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + 
.inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/test/s32-vor.yaml b/test/s32-vor.yaml new file mode 100644 index 00000000000..a31e3d719a5 --- /dev/null +++ b/test/s32-vor.yaml @@ -0,0 +1,40 @@ +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Scalar +- name: xnn_s32_vor_ukernel__scalar_u1 +- name: xnn_s32_vor_ukernel__scalar_u2 +- name: xnn_s32_vor_ukernel__scalar_u4 +- name: xnn_s32_vor_ukernel__scalar_u8 + +# x86 SSE41 +- name: xnn_s32_vor_ukernel__sse41_u4 +- name: xnn_s32_vor_ukernel__sse41_u8 +- name: xnn_s32_vor_ukernel__sse41_u12 +- name: xnn_s32_vor_ukernel__sse41_u16 + +# x86 AVX2 +- name: xnn_s32_vor_ukernel__avx2_u8 +- name: xnn_s32_vor_ukernel__avx2_u16 +- name: xnn_s32_vor_ukernel__avx2_u24 +- name: xnn_s32_vor_ukernel__avx2_u32 + +# x86 AVX512F +- name: xnn_s32_vor_ukernel__avx512f_u16 +- name: xnn_s32_vor_ukernel__avx512f_u32 +- name: xnn_s32_vor_ukernel__avx512f_u48 +- name: xnn_s32_vor_ukernel__avx512f_u64 + +# Wasm SIMD +- name: xnn_s32_vor_ukernel__wasmsimd_u4 +- name: xnn_s32_vor_ukernel__wasmsimd_u8 +- name: xnn_s32_vor_ukernel__wasmsimd_u12 +- name: xnn_s32_vor_ukernel__wasmsimd_u16 + +# ARM NEON +- name: xnn_s32_vor_ukernel__neon_u4 +- name: xnn_s32_vor_ukernel__neon_u8 +- name: xnn_s32_vor_ukernel__neon_u12 +- name: xnn_s32_vor_ukernel__neon_u16 diff --git a/test/s32-vorc.cc b/test/s32-vorc.cc new file mode 100644 index 00000000000..87e01d4ded5 --- /dev/null +++ b/test/s32-vorc.cc @@ -0,0 +1,1080 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
+// Specification: test/s32-vorc.yaml +// Generator: tools/generate-vbinary-test.py + + +#include <gtest/gtest.h> +#include "xnnpack/common.h" +#include "xnnpack/isa-checks.h" +#include "xnnpack/microparams-init.h" +#include "xnnpack/vbinary.h" +#include "vbinaryc-microkernel-tester.h" + + +TEST(S32_VORC__SCALAR_U1, batch_eq_1) { + VBinaryCMicrokernelTester() + .batch_size(1) + .Test(xnn_s32_vorc_ukernel__scalar_u1, VBinaryCMicrokernelTester::OpType::ORC); +} + +TEST(S32_VORC__SCALAR_U1, batch_gt_1) { + for (size_t batch_size = 2; batch_size < 10; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u1, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U1, inplace) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u1, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +TEST(S32_VORC__SCALAR_U2, batch_eq_2) { + VBinaryCMicrokernelTester() + .batch_size(2) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); +} + +TEST(S32_VORC__SCALAR_U2, batch_div_2) { + for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U2, batch_lt_2) { + for (size_t batch_size = 1; batch_size < 2; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U2, batch_gt_2) { + for (size_t batch_size = 3; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U2, inplace) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +TEST(S32_VORC__SCALAR_U4, batch_eq_4) { + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); +} + +TEST(S32_VORC__SCALAR_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U4, inplace) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +TEST(S32_VORC__SCALAR_U8, batch_eq_8) { + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); +} +
+TEST(S32_VORC__SCALAR_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U8, inplace) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U4, batch_eq_4) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U4, batch_div_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U4, batch_lt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U4, batch_gt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U4, inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U8, batch_eq_8) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U8, batch_div_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U8, batch_lt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U8, batch_gt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U8, 
inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U12, batch_eq_12) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U12, batch_div_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U12, batch_lt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U12, batch_gt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U12, inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U16, batch_eq_16) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U16, batch_div_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U16, batch_lt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U16, batch_gt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U16, inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U8, batch_eq_8) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U8, batch_div_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 16; batch_size < 80; batch_size += 
8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U8, batch_lt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U8, batch_gt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U8, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U16, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U24, batch_eq_24) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(24) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U24, batch_div_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U24, batch_lt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 24; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U24, batch_gt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 25; batch_size < 48; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + 
TEST(S32_VORC__AVX2_U24, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U32, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U16, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U32, batch_div_32) { + 
TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U32, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U48, batch_eq_48) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(48) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U48, batch_div_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 96; batch_size < 480; batch_size += 48) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U48, batch_lt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 48; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U48, batch_gt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 49; batch_size < 96; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U48, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U64, batch_eq_64) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(64) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U64, batch_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 128; batch_size < 640; batch_size += 64) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U64, batch_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 64; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U64, batch_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for 
(size_t batch_size = 65; batch_size < 128; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U64, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U4, batch_eq_4) { + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U4, inplace) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U8, batch_eq_8) { + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U8, inplace) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U12, batch_eq_12) { + VBinaryCMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U12, batch_div_12) { + for (size_t batch_size = 24; 
batch_size < 120; batch_size += 12) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U12, batch_lt_12) { + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U12, batch_gt_12) { + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U12, inplace) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U16, batch_eq_16) { + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U16, batch_div_16) { + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U16, batch_lt_16) { + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U16, batch_gt_16) { + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U16, inplace) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VORC__NEON_U4, batch_eq_4) { + TEST_REQUIRES_ARM_NEON; + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__NEON_U4, batch_div_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__NEON_U4, batch_lt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__NEON_U4, batch_gt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__NEON_U4, inplace) { + TEST_REQUIRES_ARM_NEON; + for (size_t 
batch_size = 1; batch_size <= 20; batch_size += 3) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+TEST(S32_VORC__NEON_U8, batch_eq_8) {
+  TEST_REQUIRES_ARM_NEON;
+  VBinaryCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+}
+
+TEST(S32_VORC__NEON_U8, batch_div_8) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U8, batch_lt_8) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U8, batch_gt_8) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U8, inplace) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+TEST(S32_VORC__NEON_U12, batch_eq_12) {
+  TEST_REQUIRES_ARM_NEON;
+  VBinaryCMicrokernelTester()
+    .batch_size(12)
+    .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+}
+
+TEST(S32_VORC__NEON_U12, batch_div_12) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 24; batch_size < 120; batch_size += 12) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U12, batch_lt_12) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size < 12; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U12, batch_gt_12) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 13; batch_size < 24; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U12, inplace) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+TEST(S32_VORC__NEON_U16, batch_eq_16) {
+  TEST_REQUIRES_ARM_NEON;
+  VBinaryCMicrokernelTester()
+    .batch_size(16)
+    .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+}
+
+TEST(S32_VORC__NEON_U16, batch_div_16) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U16, batch_lt_16) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U16, batch_gt_16) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U16, inplace) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/s32-vorc.yaml b/test/s32-vorc.yaml
new file mode 100644
index 00000000000..79c15d7f047
--- /dev/null
+++ b/test/s32-vorc.yaml
@@ -0,0 +1,40 @@
+# Copyright 2024 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Scalar
+- name: xnn_s32_vorc_ukernel__scalar_u1
+- name: xnn_s32_vorc_ukernel__scalar_u2
+- name: xnn_s32_vorc_ukernel__scalar_u4
+- name: xnn_s32_vorc_ukernel__scalar_u8
+
+# x86 SSE41
+- name: xnn_s32_vorc_ukernel__sse41_u4
+- name: xnn_s32_vorc_ukernel__sse41_u8
+- name: xnn_s32_vorc_ukernel__sse41_u12
+- name: xnn_s32_vorc_ukernel__sse41_u16
+
+# x86 AVX2
+- name: xnn_s32_vorc_ukernel__avx2_u8
+- name: xnn_s32_vorc_ukernel__avx2_u16
+- name: xnn_s32_vorc_ukernel__avx2_u24
+- name: xnn_s32_vorc_ukernel__avx2_u32
+
+# x86 AVX512F
+- name: xnn_s32_vorc_ukernel__avx512f_u16
+- name: xnn_s32_vorc_ukernel__avx512f_u32
+- name: xnn_s32_vorc_ukernel__avx512f_u48
+- name: xnn_s32_vorc_ukernel__avx512f_u64
+
+# Wasm SIMD
+- name: xnn_s32_vorc_ukernel__wasmsimd_u4
+- name: xnn_s32_vorc_ukernel__wasmsimd_u8
+- name: xnn_s32_vorc_ukernel__wasmsimd_u12
+- name: xnn_s32_vorc_ukernel__wasmsimd_u16
+
+# ARM NEON
+- name: xnn_s32_vorc_ukernel__neon_u4
+- name: xnn_s32_vorc_ukernel__neon_u8
+- name: xnn_s32_vorc_ukernel__neon_u12
+- name: xnn_s32_vorc_ukernel__neon_u16
diff --git a/test/vbinary-microkernel-tester.cc b/test/vbinary-microkernel-tester.cc
index ea557db1bfb..a2f94227d00 100644
--- a/test/vbinary-microkernel-tester.cc
+++ b/test/vbinary-microkernel-tester.cc
@@ -324,6 +324,9 @@ void VBinaryMicrokernelTester::Test(
         // Overflow is the expected behaviour
         y_ref[i] = ((((int64_t) a_data[i] * (int64_t) b_data[i]) << 32) >> 32);
         break;
+      case OpType::OR:
+        y_ref[i] = a_data[i] | b_data[i];
+        break;
       case OpType::SqrDiff: {
         const int32_t diff = a_data[i] - b_data[i];
         y_ref[i] = diff * diff;
diff --git a/test/vbinary-microkernel-tester.h b/test/vbinary-microkernel-tester.h
index a0561eaadd5..446a0eeb116 100644
--- a/test/vbinary-microkernel-tester.h
+++ b/test/vbinary-microkernel-tester.h
@@ -26,6 +26,7 @@ class VBinaryMicrokernelTester {
     Max,
     Min,
     Mul,
+    OR,
     Sub,
     SqrDiff,
   };
diff --git a/test/vbinaryc-microkernel-tester.cc b/test/vbinaryc-microkernel-tester.cc
index d534f7d0672..49def1cf1c5 100644
--- a/test/vbinaryc-microkernel-tester.cc
+++ b/test/vbinaryc-microkernel-tester.cc
@@ -351,6 +351,9 @@ void VBinaryCMicrokernelTester::Test(
         // Overflow is the expected behaviour
         y_ref[i] = ((((int64_t) a_data[i] * (int64_t) b) << 32) >> 32);
         break;
+      case OpType::ORC:
+        y_ref[i] = a_data[i] | b;
+        break;
       case OpType::SqrDiffC: {
         const int32_t diff = a_data[i] - b;
         y_ref[i] = diff * diff;
diff --git a/test/vbinaryc-microkernel-tester.h b/test/vbinaryc-microkernel-tester.h
index 2bc1f5327c2..3b11207289e 100644
--- a/test/vbinaryc-microkernel-tester.h
+++ b/test/vbinaryc-microkernel-tester.h
@@ -27,6 +27,7 @@ class VBinaryCMicrokernelTester {
     MaxC,
     MinC,
     MulC,
+    ORC,
     SqrDiffC,
     SubC,
     RSubC,
diff --git a/tools/generate-vbinary-test.py b/tools/generate-vbinary-test.py
index 3a3796d95f7..678b0212464 100755
--- a/tools/generate-vbinary-test.py
+++ b/tools/generate-vbinary-test.py
@@ -32,7 +32,7 @@ def split_ukernel_name(name):
-  match = re.fullmatch(r"xnn_(qu8|qs8|f16|f32|s32)_v(add|cmul|copysign|div|max|min|mul|sqrdiff|sub|addc|copysignc|rcopysignc|divc|rdivc|maxc|minc|mulc|sqrdiffc|subc|rsubc)(_(minmax|relu)(_(fp32|rndnu))?)?_ukernel__(.+)_u(\d+)(v)?", name)
+  match = re.fullmatch(r"xnn_(qu8|qs8|f16|f32|s32)_v(add|cmul|copysign|div|max|min|mul|sqrdiff|sub|addc|copysignc|rcopysignc|divc|rdivc|maxc|minc|mulc|sqrdiffc|subc|rsubc|or|orc)(_(minmax|relu)(_(fp32|rndnu))?)?_ukernel__(.+)_u(\d+)(v)?", name)
   if match is None:
     raise ValueError("Unexpected microkernel name: " + name)
   op_type = {
@@ -53,6 +53,8 @@ def split_ukernel_name(name):
     "maxc": "MaxC",
     "minc": "MinC",
     "mulc": "MulC",
+    "or": "OR",
+    "orc": "ORC",
     "sqrdiffc": "SqrDiffC",
     "subc": "SubC",
     "rsubc": "RSubC",
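
Note: the new OR/ORC op types are plain bitwise ORs on int32 data, matching the reference expectations added to the testers above (y_ref[i] = a_data[i] | b_data[i] for OR, y_ref[i] = a_data[i] | b for ORC). The standalone sketch below only illustrates that semantics; the helper name and signature are hypothetical, and it is not one of the generated xnn_s32_vor*/xnn_s32_vorc* microkernels listed in the build files.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative reference for the ORC ("OR with a broadcast constant") op:
// every element of a is OR-ed with the same scalar b, which is exactly the
// expectation the VBinaryCMicrokernelTester checks against.
static void s32_orc_reference(size_t batch, const int32_t* a, int32_t b, int32_t* y) {
  assert(batch != 0);
  for (size_t i = 0; i < batch; i++) {
    y[i] = a[i] | b;
  }
}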