Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Xnn s32 or #6823

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ SUBGRAPH_SRCS = [
"src/subgraph/minimum2.c",
"src/subgraph/multiply2.c",
"src/subgraph/negate.c",
"src/subgraph/or.c",
"src/subgraph/prelu.c",
"src/subgraph/reciprocal-square-root.c",
"src/subgraph/reshape-helpers.c",
Expand Down
23 changes: 23 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ SET(SUBGRAPH_SRCS
src/subgraph/minimum2.c
src/subgraph/multiply2.c
src/subgraph/negate.c
src/subgraph/or.c
src/subgraph/prelu.c
src/subgraph/reciprocal-square-root.c
src/subgraph/reshape-helpers.c
Expand Down Expand Up @@ -1627,6 +1628,11 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(negate-nc-test PRIVATE XNNPACK unary-operator-tester fp16 GTest::gtest GTest::gtest_main)
ADD_TEST(NAME negate-nc-test COMMAND negate-nc-test)

# or-nd-test: test binary built from test/or-nd.cc. It links XNNPACK plus the
# shared binary-elementwise-operator-tester library and is registered with CTest
# under the same name as the target.
ADD_EXECUTABLE(or-nd-test test/or-nd.cc)
TARGET_INCLUDE_DIRECTORIES(or-nd-test PRIVATE src test)
TARGET_LINK_LIBRARIES(or-nd-test PRIVATE XNNPACK binary-elementwise-operator-tester fp16 GTest::gtest GTest::gtest_main)
ADD_TEST(NAME or-nd-test COMMAND or-nd-test)

ADD_EXECUTABLE(prelu-nc-test test/prelu-nc.cc)
TARGET_INCLUDE_DIRECTORIES(prelu-nc-test PRIVATE src test)
TARGET_LINK_LIBRARIES(prelu-nc-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main)
Expand Down Expand Up @@ -1982,6 +1988,11 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(negate-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph)
ADD_TEST(NAME negate-test COMMAND negate-test)

# or-test: test binary built from test/or.cc. It links XNNPACK and the
# `subgraph` library and is registered with CTest under the target's name.
ADD_EXECUTABLE(or-test test/or.cc)
TARGET_INCLUDE_DIRECTORIES(or-test PRIVATE src test)
TARGET_LINK_LIBRARIES(or-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph)
ADD_TEST(NAME or-test COMMAND or-test)

ADD_EXECUTABLE(prelu-test test/prelu.cc)
TARGET_INCLUDE_DIRECTORIES(prelu-test PRIVATE src test)
TARGET_LINK_LIBRARIES(prelu-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph)
Expand Down Expand Up @@ -2789,6 +2800,18 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(s32-vmulc-test PRIVATE vbinaryc-microkernel-tester hardware-config logging microkernels-all microparams-init)
ADD_TEST(NAME s32-vmulc-test COMMAND s32-vmulc-test)

# s32-vor-test: micro-kernel test built from test/s32-vor.cc. It links the
# vbinary-microkernel-tester helper and the micro-kernel libraries directly
# rather than the XNNPACK target.
ADD_EXECUTABLE(s32-vor-test test/s32-vor.cc)
TARGET_INCLUDE_DIRECTORIES(s32-vor-test PRIVATE include src test)
TARGET_LINK_LIBRARIES(s32-vor-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main)
TARGET_LINK_LIBRARIES(s32-vor-test PRIVATE vbinary-microkernel-tester hardware-config logging microkernels-all microparams-init)
ADD_TEST(NAME s32-vor-test COMMAND s32-vor-test)

# s32-vorc-test: same layout as s32-vor-test, but built from test/s32-vorc.cc
# and linked against vbinaryc-microkernel-tester instead.
ADD_EXECUTABLE(s32-vorc-test test/s32-vorc.cc)
TARGET_INCLUDE_DIRECTORIES(s32-vorc-test PRIVATE include src test)
TARGET_LINK_LIBRARIES(s32-vorc-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main)
TARGET_LINK_LIBRARIES(s32-vorc-test PRIVATE vbinaryc-microkernel-tester hardware-config logging microkernels-all microparams-init)
ADD_TEST(NAME s32-vorc-test COMMAND s32-vorc-test)

ADD_EXECUTABLE(f16-vcmul-test test/f16-vcmul.cc)
SET_TARGET_PROPERTIES(f16-vcmul-test PROPERTIES CXX_EXTENSIONS YES)
TARGET_INCLUDE_DIRECTORIES(f16-vcmul-test PRIVATE include src test)
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/avx2_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,8 @@ SET(ALL_AVX2_MICROKERNEL_SRCS
src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c
src/s32-vmul/gen/s32-vmul-avx2.c
src/s32-vmul/gen/s32-vmulc-avx2.c
src/s32-vor/gen/s32-vor-avx2.c
src/s32-vor/gen/s32-vorc-avx2.c
src/x8-lut/gen/x8-lut-avx2-u32.c
src/x8-lut/gen/x8-lut-avx2-u64.c
src/x8-lut/gen/x8-lut-avx2-u96.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/avx512f_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -311,5 +311,7 @@ SET(ALL_AVX512F_MICROKERNEL_SRCS
src/math/f32-sqrt-avx512f-nr2fma.c
src/s32-vmul/gen/s32-vmul-avx512f.c
src/s32-vmul/gen/s32-vmulc-avx512f.c
src/s32-vor/gen/s32-vor-avx512f.c
src/s32-vor/gen/s32-vorc-avx512f.c
src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4-prfm.c
src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4.c)
2 changes: 2 additions & 0 deletions cmake/gen/neon_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,8 @@ SET(ALL_NEON_MICROKERNEL_SRCS
src/s16-window/gen/s16-window-shift15-neon-u32.c
src/s32-vmul/gen/s32-vmul-neon.c
src/s32-vmul/gen/s32-vmulc-neon.c
src/s32-vor/gen/s32-vor-neon.c
src/s32-vor/gen/s32-vorc-neon.c
src/u8-ibilinear/gen/u8-ibilinear-neon-c8.c
src/u8-ibilinear/gen/u8-ibilinear-neon-c16.c
src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/scalar_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,8 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/s16-window/gen/s16-window-scalar-u4.c
src/s32-vmul/gen/s32-vmul-scalar.c
src/s32-vmul/gen/s32-vmulc-scalar.c
src/s32-vor/gen/s32-vor-scalar.c
src/s32-vor/gen/s32-vorc-scalar.c
src/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c
src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c
src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/sse41_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -381,5 +381,7 @@ SET(ALL_SSE41_MICROKERNEL_SRCS
src/s8-vclamp/s8-vclamp-sse41-u64.c
src/s32-vmul/gen/s32-vmul-sse41.c
src/s32-vmul/gen/s32-vmulc-sse41.c
src/s32-vor/gen/s32-vor-sse41.c
src/s32-vor/gen/s32-vorc-sse41.c
src/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c
src/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c)
2 changes: 2 additions & 0 deletions cmake/gen/wasmsimd_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1190,6 +1190,8 @@ SET(ALL_WASMSIMD_MICROKERNEL_SRCS
src/s8-vclamp/s8-vclamp-wasmsimd-u64.c
src/s32-vmul/gen/s32-vmul-wasmsimd.c
src/s32-vmul/gen/s32-vmulc-wasmsimd.c
src/s32-vor/gen/s32-vor-wasmsimd.c
src/s32-vor/gen/s32-vorc-wasmsimd.c
src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c
src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c16.c
src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c8.c
Expand Down
2 changes: 2 additions & 0 deletions gen/avx2_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,8 @@ ALL_AVX2_MICROKERNEL_SRCS = [
"src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c",
"src/s32-vmul/gen/s32-vmul-avx2.c",
"src/s32-vmul/gen/s32-vmulc-avx2.c",
"src/s32-vor/gen/s32-vor-avx2.c",
"src/s32-vor/gen/s32-vorc-avx2.c",
"src/x8-lut/gen/x8-lut-avx2-u32.c",
"src/x8-lut/gen/x8-lut-avx2-u64.c",
"src/x8-lut/gen/x8-lut-avx2-u96.c",
Expand Down
2 changes: 2 additions & 0 deletions gen/avx512f_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,8 @@ ALL_AVX512F_MICROKERNEL_SRCS = [
"src/math/f32-sqrt-avx512f-nr2fma.c",
"src/s32-vmul/gen/s32-vmul-avx512f.c",
"src/s32-vmul/gen/s32-vmulc-avx512f.c",
"src/s32-vor/gen/s32-vor-avx512f.c",
"src/s32-vor/gen/s32-vorc-avx512f.c",
"src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4-prfm.c",
"src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4.c",
]
2 changes: 2 additions & 0 deletions gen/neon_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,8 @@ ALL_NEON_MICROKERNEL_SRCS = [
"src/s16-window/gen/s16-window-shift15-neon-u32.c",
"src/s32-vmul/gen/s32-vmul-neon.c",
"src/s32-vmul/gen/s32-vmulc-neon.c",
"src/s32-vor/gen/s32-vor-neon.c",
"src/s32-vor/gen/s32-vorc-neon.c",
"src/u8-ibilinear/gen/u8-ibilinear-neon-c8.c",
"src/u8-ibilinear/gen/u8-ibilinear-neon-c16.c",
"src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c",
Expand Down
2 changes: 2 additions & 0 deletions gen/scalar_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1048,6 +1048,8 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/s16-window/gen/s16-window-scalar-u4.c",
"src/s32-vmul/gen/s32-vmul-scalar.c",
"src/s32-vmul/gen/s32-vmulc-scalar.c",
"src/s32-vor/gen/s32-vor-scalar.c",
"src/s32-vor/gen/s32-vorc-scalar.c",
"src/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c",
"src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c",
"src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c",
Expand Down
2 changes: 2 additions & 0 deletions gen/sse41_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,8 @@ ALL_SSE41_MICROKERNEL_SRCS = [
"src/s8-vclamp/s8-vclamp-sse41-u64.c",
"src/s32-vmul/gen/s32-vmul-sse41.c",
"src/s32-vmul/gen/s32-vmulc-sse41.c",
"src/s32-vor/gen/s32-vor-sse41.c",
"src/s32-vor/gen/s32-vorc-sse41.c",
"src/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c",
"src/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c",
]
2 changes: 2 additions & 0 deletions gen/wasmsimd_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,8 @@ ALL_WASMSIMD_MICROKERNEL_SRCS = [
"src/s8-vclamp/s8-vclamp-wasmsimd-u64.c",
"src/s32-vmul/gen/s32-vmul-wasmsimd.c",
"src/s32-vmul/gen/s32-vmulc-wasmsimd.c",
"src/s32-vor/gen/s32-vor-wasmsimd.c",
"src/s32-vor/gen/s32-vorc-wasmsimd.c",
"src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c",
"src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c16.c",
"src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c8.c",
Expand Down
34 changes: 34 additions & 0 deletions include/xnnpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -1789,6 +1789,22 @@ enum xnn_status xnn_define_negate(
uint32_t output_id,
uint32_t flags);

/// Define a Bitwise OR Node and add it to a Subgraph.
///
/// The OR node performs a bitwise OR between the first and second input.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be defined in the @a subgraph.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor.
/// @param flags - binary features of the OR Node. No supported flags are currently defined.
enum xnn_status xnn_define_or(
xnn_subgraph_t subgraph,
uint32_t input1_id,
uint32_t input2_id,
uint32_t output_id,
uint32_t flags);

/// Define a Sigmoid Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
Expand Down Expand Up @@ -5437,6 +5453,24 @@ enum xnn_status xnn_run_negate_nc_f32(
uint32_t flags,
pthreadpool_t threadpool);

// Operator-level entry points for the int32 OR operator ("or_nd_s32").
// NOTE(review): the implementations are not visible from this header; the
// descriptions below are derived from the signatures only - confirm against
// the operator sources.

// Creates the operator and hands it back through or_op_out.
enum xnn_status xnn_create_or_nd_s32(
uint32_t flags,
xnn_operator_t* or_op_out);

// Takes the dimension count and shape array of each of the two inputs, plus
// the thread pool.
// NOTE(review): presumably must be called before setup, and presumably the two
// shapes may differ (broadcasting) - confirm.
enum xnn_status xnn_reshape_or_nd_s32(
xnn_operator_t or_op,
size_t num_input1_dims,
const size_t* input1_shape,
size_t num_input2_dims,
const size_t* input2_shape,
pthreadpool_t threadpool);

// Takes the two read-only int32 input buffers and the int32 output buffer
// for the operator.
enum xnn_status xnn_setup_or_nd_s32(
xnn_operator_t or_op,
const int32_t* input1,
const int32_t* input2,
int32_t* output);

enum xnn_status xnn_create_prelu_nc_f16(
size_t input_channels,
size_t slope_channels,
Expand Down
23 changes: 23 additions & 0 deletions scripts/generate-s32-vor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/sh
# Copyright 2024 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

################################## SIMD VOR / VORC ##################################
# Each entry is "<arch>:<comma-separated batch tiles>". The same table drives both
# templates: s32-vor.c.in (vector | vector) and s32-vorc.c.in (vector | constant).
# Every xngen invocation runs in the background; `wait` below joins them all.
for kernel in vor vorc; do
  for spec in \
      scalar:1,2,4,8 \
      sse41:4,8,12,16 \
      wasmsimd:4,8,12,16 \
      neon:4,8,12,16 \
      avx2:8,16,24,32 \
      avx512f:16,32,48,64
  do
    arch=${spec%%:*}   # text before the first ':'
    tiles=${spec#*:}   # text after the first ':'
    tools/xngen "src/s32-vor/s32-${kernel}.c.in" -D "ARCH=${arch}" -D "BATCH_TILES=${tiles}" -o "src/s32-vor/gen/s32-${kernel}-${arch}.c" &
  done
done

wait
3 changes: 3 additions & 0 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,9 @@ tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/qu
tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --spec test/s32-vmul.yaml --output test/s32-vmul.cc &
tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/s32-vmulc.yaml --output test/s32-vmulc.cc &

tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --spec test/s32-vor.yaml --output test/s32-vor.cc &
tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/s32-vorc.yaml --output test/s32-vorc.cc &

### Tests for VUnary micro-kernels
tools/generate-vunary-test.py --spec test/bf16-vabs.yaml --output test/bf16-vabs.cc &

Expand Down
100 changes: 100 additions & 0 deletions src/amalgam/gen/avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -15832,3 +15832,103 @@ void xnn_s32_vmulc_ukernel__avx2_u16(
xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T);
}
}

// Element-wise OR micro-kernel over two int32 arrays: each output vector is
// xnn_or_s32(a, b) of the corresponding vectors of input_a and input_b.
//
// batch   - size of the inputs in BYTES; asserted to be non-zero and a multiple
//           of sizeof(int32_t).
// input_a - first operand array (must not be NULL).
// input_b - second operand array (must not be NULL).
// output  - destination array (must not be NULL).
// params  - not referenced by this kernel.
void xnn_s32_vor_ukernel__avx2_u16(
    size_t batch,
    const int32_t* input_a,
    const int32_t* input_b,
    int32_t* output,
    const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(batch % sizeof(int32_t) == 0);
  assert(input_b != NULL);
  assert(input_a != NULL);
  assert(output != NULL);
  assert(xnn_simd_size_s32 == 8);

  // Unrolled loop: 16 elements (two SIMD vectors) per iteration.
  while (batch >= 16 * sizeof(int32_t)) {
    xnn_simd_s32_t va_lo = xnn_loadu_s32(input_a);
    xnn_simd_s32_t va_hi = xnn_loadu_s32(input_a + xnn_simd_size_s32);
    input_a += 16;

    xnn_simd_s32_t vb_lo = xnn_loadu_s32(input_b);
    xnn_simd_s32_t vb_hi = xnn_loadu_s32(input_b + xnn_simd_size_s32);
    input_b += 16;

    xnn_simd_s32_t vout_lo = xnn_or_s32(va_lo, vb_lo);
    xnn_simd_s32_t vout_hi = xnn_or_s32(va_hi, vb_hi);

    xnn_storeu_s32(output, vout_lo);
    xnn_storeu_s32(output + xnn_simd_size_s32, vout_hi);
    output += 16;

    batch -= 16 * sizeof(int32_t);
  }

  // Remainder loop: one full SIMD vector per iteration.
  while (batch >= xnn_simd_bytes_s32) {
    xnn_simd_s32_t va = xnn_loadu_s32(input_a);
    input_a += xnn_simd_size_s32;

    xnn_simd_s32_t vb = xnn_loadu_s32(input_b);
    input_b += xnn_simd_size_s32;

    xnn_simd_s32_t vout = xnn_or_s32(va, vb);

    xnn_storeu_s32(output, vout);
    output += xnn_simd_size_s32;

    batch -= xnn_simd_bytes_s32;
  }

  // Tail: fewer elements than one vector remain; the *_tail_s32 helpers are
  // given the leftover element count (bytes converted to elements).
  if XNN_UNLIKELY(batch != 0) {
    const size_t tail_elements = batch >> XNN_LOG2_SIZEOF_INT32_T;

    xnn_simd_s32_t va = xnn_load_tail_s32(input_a, tail_elements);
    xnn_simd_s32_t vb = xnn_load_tail_s32(input_b, tail_elements);

    xnn_simd_s32_t vout = xnn_or_s32(va, vb);

    xnn_store_tail_s32(output, vout, tail_elements);
  }
}

// Element-wise OR micro-kernel with a constant second operand: each output
// vector is xnn_or_s32(a, c), where c is the single value *input2 replicated
// with xnn_set1_s32.
//
// batch  - size of input1 in BYTES; asserted to be non-zero and a multiple of
//          sizeof(int32_t).
// input1 - operand array (must not be NULL).
// input2 - pointer to the constant operand; only *input2 is read.
// output - destination array (must not be NULL).
// params - not referenced by this kernel.
void xnn_s32_vorc_ukernel__avx2_u16(
    size_t batch,
    const int32_t* input1,
    const int32_t* input2,
    int32_t* output,
    const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(batch % sizeof(int32_t) == 0);
  assert(input1 != NULL);
  assert(input2 != NULL);
  assert(output != NULL);
  assert(xnn_simd_size_s32 == 8);

  // The constant is loaded once, outside all loops.
  xnn_simd_s32_t vconst = xnn_set1_s32(*input2);

  // Unrolled loop: 16 elements (two SIMD vectors) per iteration.
  while (batch >= 16 * sizeof(int32_t)) {
    xnn_simd_s32_t va_lo = xnn_loadu_s32(input1);
    xnn_simd_s32_t va_hi = xnn_loadu_s32(input1 + xnn_simd_size_s32);
    input1 += 16;

    xnn_simd_s32_t vout_lo = xnn_or_s32(va_lo, vconst);
    xnn_simd_s32_t vout_hi = xnn_or_s32(va_hi, vconst);

    xnn_storeu_s32(output, vout_lo);
    xnn_storeu_s32(output + xnn_simd_size_s32, vout_hi);
    output += 16;

    batch -= 16 * sizeof(int32_t);
  }

  // Remainder loop: one full SIMD vector per iteration.
  while (batch >= xnn_simd_bytes_s32) {
    xnn_simd_s32_t va = xnn_loadu_s32(input1);
    input1 += xnn_simd_size_s32;

    xnn_simd_s32_t vout = xnn_or_s32(va, vconst);

    xnn_storeu_s32(output, vout);
    output += xnn_simd_size_s32;

    batch -= xnn_simd_bytes_s32;
  }

  // Tail: fewer elements than one vector remain; the *_tail_s32 helpers are
  // given the leftover element count (bytes converted to elements).
  if XNN_UNLIKELY(batch != 0) {
    const size_t tail_elements = batch >> XNN_LOG2_SIZEOF_INT32_T;

    xnn_simd_s32_t va = xnn_load_tail_s32(input1, tail_elements);

    xnn_simd_s32_t vout = xnn_or_s32(va, vconst);

    xnn_store_tail_s32(output, vout, tail_elements);
  }
}
Loading