diff --git a/BUILD.bazel b/BUILD.bazel index 16fbf7adac4..c4be242dc19 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -160,6 +160,7 @@ SUBGRAPH_SRCS = [ "src/subgraph/minimum2.c", "src/subgraph/multiply2.c", "src/subgraph/negate.c", + "src/subgraph/or.c", "src/subgraph/prelu.c", "src/subgraph/reciprocal-square-root.c", "src/subgraph/reshape-helpers.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d39c361fdf..c59808fa51c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -411,6 +411,7 @@ SET(SUBGRAPH_SRCS src/subgraph/minimum2.c src/subgraph/multiply2.c src/subgraph/negate.c + src/subgraph/or.c src/subgraph/prelu.c src/subgraph/reciprocal-square-root.c src/subgraph/reshape-helpers.c @@ -1627,6 +1628,11 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(negate-nc-test PRIVATE XNNPACK unary-operator-tester fp16 GTest::gtest GTest::gtest_main) ADD_TEST(NAME negate-nc-test COMMAND negate-nc-test) + ADD_EXECUTABLE(or-nd-test test/or-nd.cc) + TARGET_INCLUDE_DIRECTORIES(or-nd-test PRIVATE src test) + TARGET_LINK_LIBRARIES(or-nd-test PRIVATE XNNPACK binary-elementwise-operator-tester fp16 GTest::gtest GTest::gtest_main) + ADD_TEST(NAME or-nd-test COMMAND or-nd-test) + ADD_EXECUTABLE(prelu-nc-test test/prelu-nc.cc) TARGET_INCLUDE_DIRECTORIES(prelu-nc-test PRIVATE src test) TARGET_LINK_LIBRARIES(prelu-nc-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main) @@ -1982,6 +1988,11 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(negate-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph) ADD_TEST(NAME negate-test COMMAND negate-test) + ADD_EXECUTABLE(or-test test/or.cc) + TARGET_INCLUDE_DIRECTORIES(or-test PRIVATE src test) + TARGET_LINK_LIBRARIES(or-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph) + ADD_TEST(NAME or-test COMMAND or-test) + ADD_EXECUTABLE(prelu-test test/prelu.cc) TARGET_INCLUDE_DIRECTORIES(prelu-test PRIVATE src test) TARGET_LINK_LIBRARIES(prelu-test PRIVATE XNNPACK fp16 GTest::gtest GTest::gtest_main subgraph) @@ -2789,6 +2800,18 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(s32-vmulc-test PRIVATE vbinaryc-microkernel-tester hardware-config logging microkernels-all microparams-init) ADD_TEST(NAME s32-vmulc-test COMMAND s32-vmulc-test) + ADD_EXECUTABLE(s32-vor-test test/s32-vor.cc) + TARGET_INCLUDE_DIRECTORIES(s32-vor-test PRIVATE include src test) + TARGET_LINK_LIBRARIES(s32-vor-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main) + TARGET_LINK_LIBRARIES(s32-vor-test PRIVATE vbinary-microkernel-tester hardware-config logging microkernels-all microparams-init) + ADD_TEST(NAME s32-vor-test COMMAND s32-vor-test) + + ADD_EXECUTABLE(s32-vorc-test test/s32-vorc.cc) + TARGET_INCLUDE_DIRECTORIES(s32-vorc-test PRIVATE include src test) + TARGET_LINK_LIBRARIES(s32-vorc-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main) + TARGET_LINK_LIBRARIES(s32-vorc-test PRIVATE vbinaryc-microkernel-tester hardware-config logging microkernels-all microparams-init) + ADD_TEST(NAME s32-vorc-test COMMAND s32-vorc-test) + ADD_EXECUTABLE(f16-vcmul-test test/f16-vcmul.cc) SET_TARGET_PROPERTIES(f16-vcmul-test PROPERTIES CXX_EXTENSIONS YES) TARGET_INCLUDE_DIRECTORIES(f16-vcmul-test PRIVATE include src test) diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index d165cbf3e39..5d47d8defcf 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -570,6 +570,8 @@ SET(ALL_AVX2_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c src/s32-vmul/gen/s32-vmul-avx2.c 
src/s32-vmul/gen/s32-vmulc-avx2.c + src/s32-vor/gen/s32-vor-avx2.c + src/s32-vor/gen/s32-vorc-avx2.c src/x8-lut/gen/x8-lut-avx2-u32.c src/x8-lut/gen/x8-lut-avx2-u64.c src/x8-lut/gen/x8-lut-avx2-u96.c diff --git a/cmake/gen/avx512f_microkernels.cmake b/cmake/gen/avx512f_microkernels.cmake index 099ec1a04cb..27be19c1338 100644 --- a/cmake/gen/avx512f_microkernels.cmake +++ b/cmake/gen/avx512f_microkernels.cmake @@ -311,5 +311,7 @@ SET(ALL_AVX512F_MICROKERNEL_SRCS src/math/f32-sqrt-avx512f-nr2fma.c src/s32-vmul/gen/s32-vmul-avx512f.c src/s32-vmul/gen/s32-vmulc-avx512f.c + src/s32-vor/gen/s32-vor-avx512f.c + src/s32-vor/gen/s32-vorc-avx512f.c src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4-prfm.c src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4.c) diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index 407389db353..e01a30d2dbb 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -894,6 +894,8 @@ SET(ALL_NEON_MICROKERNEL_SRCS src/s16-window/gen/s16-window-shift15-neon-u32.c src/s32-vmul/gen/s32-vmul-neon.c src/s32-vmul/gen/s32-vmulc-neon.c + src/s32-vor/gen/s32-vor-neon.c + src/s32-vor/gen/s32-vorc-neon.c src/u8-ibilinear/gen/u8-ibilinear-neon-c8.c src/u8-ibilinear/gen/u8-ibilinear-neon-c16.c src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index ebea43802e4..559d1a740d7 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -1052,6 +1052,8 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS src/s16-window/gen/s16-window-scalar-u4.c src/s32-vmul/gen/s32-vmul-scalar.c src/s32-vmul/gen/s32-vmulc-scalar.c + src/s32-vor/gen/s32-vor-scalar.c + src/s32-vor/gen/s32-vorc-scalar.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c diff --git a/cmake/gen/sse41_microkernels.cmake b/cmake/gen/sse41_microkernels.cmake index 6f6383f3ed8..cec3a788bd4 100644 --- a/cmake/gen/sse41_microkernels.cmake +++ b/cmake/gen/sse41_microkernels.cmake @@ -381,5 +381,7 @@ SET(ALL_SSE41_MICROKERNEL_SRCS src/s8-vclamp/s8-vclamp-sse41-u64.c src/s32-vmul/gen/s32-vmul-sse41.c src/s32-vmul/gen/s32-vmulc-sse41.c + src/s32-vor/gen/s32-vor-sse41.c + src/s32-vor/gen/s32-vorc-sse41.c src/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c src/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c) diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake index 1f3ea9da1b8..d58738bfaa9 100644 --- a/cmake/gen/wasmsimd_microkernels.cmake +++ b/cmake/gen/wasmsimd_microkernels.cmake @@ -1190,6 +1190,8 @@ SET(ALL_WASMSIMD_MICROKERNEL_SRCS src/s8-vclamp/s8-vclamp-wasmsimd-u64.c src/s32-vmul/gen/s32-vmul-wasmsimd.c src/s32-vmul/gen/s32-vmulc-wasmsimd.c + src/s32-vor/gen/s32-vor-wasmsimd.c + src/s32-vor/gen/s32-vorc-wasmsimd.c src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c16.c src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c8.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index 05ed2b08c81..0bfec2769cd 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -566,6 +566,8 @@ ALL_AVX2_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c", "src/s32-vmul/gen/s32-vmul-avx2.c", "src/s32-vmul/gen/s32-vmulc-avx2.c", + "src/s32-vor/gen/s32-vor-avx2.c", + "src/s32-vor/gen/s32-vorc-avx2.c", "src/x8-lut/gen/x8-lut-avx2-u32.c", "src/x8-lut/gen/x8-lut-avx2-u64.c", 
"src/x8-lut/gen/x8-lut-avx2-u96.c", diff --git a/gen/avx512f_microkernels.bzl b/gen/avx512f_microkernels.bzl index 407c7a31eed..af40f9844ab 100644 --- a/gen/avx512f_microkernels.bzl +++ b/gen/avx512f_microkernels.bzl @@ -307,6 +307,8 @@ ALL_AVX512F_MICROKERNEL_SRCS = [ "src/math/f32-sqrt-avx512f-nr2fma.c", "src/s32-vmul/gen/s32-vmul-avx512f.c", "src/s32-vmul/gen/s32-vmulc-avx512f.c", + "src/s32-vor/gen/s32-vor-avx512f.c", + "src/s32-vor/gen/s32-vorc-avx512f.c", "src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4-prfm.c", "src/x32-packw/gen/x32-packw-x16-gemm-goi-avx512f-u4.c", ] diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index 4c61e0611b6..fce1853765d 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -890,6 +890,8 @@ ALL_NEON_MICROKERNEL_SRCS = [ "src/s16-window/gen/s16-window-shift15-neon-u32.c", "src/s32-vmul/gen/s32-vmul-neon.c", "src/s32-vmul/gen/s32-vmulc-neon.c", + "src/s32-vor/gen/s32-vor-neon.c", + "src/s32-vor/gen/s32-vorc-neon.c", "src/u8-ibilinear/gen/u8-ibilinear-neon-c8.c", "src/u8-ibilinear/gen/u8-ibilinear-neon-c16.c", "src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index 8e97dde5c94..2675f6e2aa9 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -1048,6 +1048,8 @@ ALL_SCALAR_MICROKERNEL_SRCS = [ "src/s16-window/gen/s16-window-scalar-u4.c", "src/s32-vmul/gen/s32-vmul-scalar.c", "src/s32-vmul/gen/s32-vmulc-scalar.c", + "src/s32-vor/gen/s32-vor-scalar.c", + "src/s32-vor/gen/s32-vorc-scalar.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c", "src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c", diff --git a/gen/sse41_microkernels.bzl b/gen/sse41_microkernels.bzl index 17251c8430e..8261ca439e7 100644 --- a/gen/sse41_microkernels.bzl +++ b/gen/sse41_microkernels.bzl @@ -377,6 +377,8 @@ ALL_SSE41_MICROKERNEL_SRCS = [ "src/s8-vclamp/s8-vclamp-sse41-u64.c", "src/s32-vmul/gen/s32-vmul-sse41.c", "src/s32-vmul/gen/s32-vmulc-sse41.c", + "src/s32-vor/gen/s32-vor-sse41.c", + "src/s32-vor/gen/s32-vorc-sse41.c", "src/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c", "src/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c", ] diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index 061d7d6d818..a80b74b1a36 100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -1186,6 +1186,8 @@ ALL_WASMSIMD_MICROKERNEL_SRCS = [ "src/s8-vclamp/s8-vclamp-wasmsimd-u64.c", "src/s32-vmul/gen/s32-vmul-wasmsimd.c", "src/s32-vmul/gen/s32-vmulc-wasmsimd.c", + "src/s32-vor/gen/s32-vor-wasmsimd.c", + "src/s32-vor/gen/s32-vorc-wasmsimd.c", "src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c", "src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c16.c", "src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c8.c", diff --git a/include/xnnpack.h b/include/xnnpack.h index c3b81bf619d..913db41c1b2 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -1789,6 +1789,22 @@ enum xnn_status xnn_define_negate( uint32_t output_id, uint32_t flags); +/// Define a Bitwsie OR Node and add it to a Subgraph. +/// +/// The OR node peforms bitwise OR between first and second input. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param input1_id - Value ID for the first input tensor. The input tensor must be defined in the @a subgraph. +/// @param input2_id - Value ID for the second input tensor. The input tensor must be defined in the @a subgraph. 
+/// @param output_id - Value ID for the output tensor. +/// @param flags - binary features of the OR Node. No supported flags are currently defined. +enum xnn_status xnn_define_or( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags); + /// Define a Sigmoid Node and add it to a Subgraph. /// /// @param subgraph - a Subgraph object that will own the created Node. @@ -5437,6 +5453,24 @@ enum xnn_status xnn_run_negate_nc_f32( uint32_t flags, pthreadpool_t threadpool); +enum xnn_status xnn_create_or_nd_s32( + uint32_t flags, + xnn_operator_t* or_op_out); + +enum xnn_status xnn_reshape_or_nd_s32( + xnn_operator_t or_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_or_nd_s32( + xnn_operator_t or_op, + const int32_t* input1, + const int32_t* input2, + int32_t* output); + enum xnn_status xnn_create_prelu_nc_f16( size_t input_channels, size_t slope_channels, diff --git a/scripts/generate-s32-vor.sh b/scripts/generate-s32-vor.sh new file mode 100755 index 00000000000..583edcc7c0c --- /dev/null +++ b/scripts/generate-s32-vor.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +##################################### SIMD VOR ##################################### +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=scalar -D BATCH_TILES=1,2,4,8 -o src/s32-vor/gen/s32-vor-scalar.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=sse41 -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vor-sse41.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=wasmsimd -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vor-wasmsimd.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=neon -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vor-neon.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=avx2 -D BATCH_TILES=8,16,24,32 -o src/s32-vor/gen/s32-vor-avx2.c & +tools/xngen src/s32-vor/s32-vor.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/s32-vor/gen/s32-vor-avx512f.c & + +##################################### SIMD VORC ##################################### +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=scalar -D BATCH_TILES=1,2,4,8 -o src/s32-vor/gen/s32-vorc-scalar.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=sse41 -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vorc-sse41.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=wasmsimd -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vorc-wasmsimd.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=neon -D BATCH_TILES=4,8,12,16 -o src/s32-vor/gen/s32-vorc-neon.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=avx2 -D BATCH_TILES=8,16,24,32 -o src/s32-vor/gen/s32-vorc-avx2.c & +tools/xngen src/s32-vor/s32-vorc.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/s32-vor/gen/s32-vorc-avx512f.c & + +wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index 7af7fcdb3c8..b1ea04f9134 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -194,6 +194,9 @@ tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/qu tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --spec test/s32-vmul.yaml --output test/s32-vmul.cc & tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/s32-vmulc.yaml --output test/s32-vmulc.cc & +tools/generate-vbinary-test.py --tester 
VBinaryMicrokernelTester --spec test/s32-vor.yaml --output test/s32-vor.cc & +tools/generate-vbinary-test.py --tester VBinaryCMicrokernelTester --spec test/s32-vorc.yaml --output test/s32-vorc.cc & + ### Tests for VUnary micro-kernels tools/generate-vunary-test.py --spec test/bf16-vabs.yaml --output test/bf16-vabs.cc & diff --git a/src/amalgam/gen/avx2.c b/src/amalgam/gen/avx2.c index 8cbc6e0e7bd..b1292da20cd 100644 --- a/src/amalgam/gen/avx2.c +++ b/src/amalgam/gen/avx2.c @@ -15832,3 +15832,103 @@ void xnn_s32_vmulc_ukernel__avx2_u16( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__avx2_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + 
xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/avx512f.c b/src/amalgam/gen/avx512f.c index 986eb3b61a7..802f8bd08eb 100644 --- a/src/amalgam/gen/avx512f.c +++ b/src/amalgam/gen/avx512f.c @@ -2231,8 +2231,6 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c64( } for (int i = 0; i < channels >> 4; ++i) { vacc[i] = _mm512_add_ps(vo[i], vacc[i]); - vacc[i] = _mm512_max_ps(vacc[i], vmin); - vacc[i] = _mm512_min_ps(vacc[i], vmax); } for (int i = 0; i < channels >> 4; ++i) { _mm512_storeu_ps(output, vacc[i]); output += 16; @@ -2240,8 +2238,6 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c64( if (remainder) { const size_t pos = num_full_chunks; __m512 vout = vacc[pos]; - vout = _mm512_max_ps(vout, vmin); - vout = _mm512_min_ps(vout, vmax); vout = _mm512_maskz_add_ps(vmask, vout, _mm512_maskz_loadu_ps(vmask, output)); _mm512_mask_storeu_ps(output, vmask, vout); } @@ -5557,3 +5553,103 @@ void xnn_s32_vmulc_ukernel__avx512f_u32( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__avx512f_u32( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 32; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u32( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for 
(; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/neon.c b/src/amalgam/gen/neon.c index 494a9b769fe..b551daa3e62 100644 --- a/src/amalgam/gen/neon.c +++ b/src/amalgam/gen/neon.c @@ -30606,3 +30606,103 @@ void xnn_s32_vmulc_ukernel__neon_u8( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__neon_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if 
XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/scalar.c b/src/amalgam/gen/scalar.c index a1e98ae3fb9..cd828afa941 100644 --- a/src/amalgam/gen/scalar.c +++ b/src/amalgam/gen/scalar.c @@ -6,9 +6,9 @@ // Auto-generated file. Do not edit! // Generator: tools/update-microkernels.py -a -#include #include #include +#include #include #include #include @@ -33351,3 +33351,87 @@ void xnn_s32_vmulc_ukernel__scalar_u2( output += xnn_simd_size_s32; } } + +void xnn_s32_vor_ukernel__scalar_u2( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 2; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u2( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} diff --git a/src/amalgam/gen/sse41.c b/src/amalgam/gen/sse41.c index fa0ad329bb2..85888c4b2b2 100644 --- a/src/amalgam/gen/sse41.c +++ b/src/amalgam/gen/sse41.c @@ -11478,3 +11478,103 @@ void xnn_s32_vmulc_ukernel__sse41_u8( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__sse41_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params 
params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/amalgam/gen/wasmsimd.c b/src/amalgam/gen/wasmsimd.c index cb9daa3efe9..e0902309215 100644 --- a/src/amalgam/gen/wasmsimd.c +++ b/src/amalgam/gen/wasmsimd.c @@ -40917,3 +40917,117 @@ void xnn_s32_vmulc_ukernel__wasmsimd_u16( xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); } } + +void xnn_s32_vor_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * 
sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/configs/binary-elementwise-config.c 
b/src/configs/binary-elementwise-config.c index d695fdce271..ab768d8668e 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -32,6 +32,7 @@ static struct xnn_binary_elementwise_config f32_vsqrdiff_config = {0}; static struct xnn_binary_elementwise_config s32_vmul_config = {0}; +static struct xnn_binary_elementwise_config s32_vor_config = {0}; static struct xnn_binary_elementwise_config qs8_vadd_config = {0}; static struct xnn_binary_elementwise_config qs8_vmul_config = {0}; @@ -55,6 +56,7 @@ XNN_INIT_ONCE_GUARD(f32_vmul); XNN_INIT_ONCE_GUARD(f32_vsub); XNN_INIT_ONCE_GUARD(f32_vsqrdiff); XNN_INIT_ONCE_GUARD(s32_vmul); +XNN_INIT_ONCE_GUARD(s32_vor); XNN_INIT_ONCE_GUARD(qs8_vadd); XNN_INIT_ONCE_GUARD(qs8_vmul); XNN_INIT_ONCE_GUARD(qu8_vadd); @@ -556,6 +558,61 @@ static void init_s32_vmul_config(void) { #endif } +static void init_s32_vor_config(void) { + #if XNN_ARCH_ARM + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_arm_neon) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__neon_u8; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.element_tile = 8; + } + else if (!XNN_PLATFORM_MOBILE) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__scalar_u2; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.element_tile = 2; + } + #elif XNN_ARCH_ARM64 + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__neon_u8; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__neon_u8; + s32_vor_config.linear.element_tile = 8; + #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__avx512f_u32; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx512f_u32; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx512f_u32; + s32_vor_config.linear.element_tile = 32; + } + else if (hardware_config->use_x86_avx2) { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__avx2_u16; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx2_u16; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__avx2_u16; + s32_vor_config.linear.element_tile = 16; + } + else { + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__sse41_u8; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__sse41_u8; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__sse41_u8; + s32_vor_config.linear.element_tile = 8; + } + #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__wasmsimd_u16; 
+ s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__wasmsimd_u16; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__wasmsimd_u16; + s32_vor_config.linear.element_tile = 16; + #else + s32_vor_config.linear.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vor_ukernel__scalar_u2; + s32_vor_config.linear.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_s32_vorc_ukernel__scalar_u2; + s32_vor_config.linear.element_tile = 2; + #endif +} + static void init_f32_vdiv_config(void) { #if XNN_ARCH_ARM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1400,6 +1457,15 @@ const struct xnn_binary_elementwise_config* xnn_init_s32_vmul_config() { return &s32_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_s32_vor_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(s32_vor); + return &s32_vor_config; +} + const struct xnn_binary_elementwise_config* xnn_init_f32_vdiv_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { diff --git a/src/enums/node-type.c b/src/enums/node-type.c index 0db7e426904..0593d1201d1 100644 --- a/src/enums/node-type.c +++ b/src/enums/node-type.c @@ -13,10 +13,10 @@ #include "xnnpack/node-type.h" #if XNN_LOG_LEVEL > 0 -static const uint16_t offset[62] = { +static const uint16_t offset[63] = { 0, 8, 12, 17, 35, 54, 71, 93, 101, 107, 120, 133, 146, 159, 167, 182, 187, 197, 214, 232, 257, 264, 268, 272, 284, - 296, 308, 314, 330, 353, 358, 384, 410, 432, 454, 464, 468, 479, 494, 503, 512, 522, 529, 535, 558, 569, 574, 603, - 611, 619, 637, 644, 656, 675, 695, 707, 722, 748, 761, 778, 787, 792 + 296, 308, 314, 330, 353, 358, 384, 410, 432, 454, 464, 468, 479, 494, 503, 512, 522, 529, 532, 538, 561, 572, 577, + 606, 614, 622, 640, 647, 659, 678, 698, 710, 725, 751, 764, 781, 790, 795 }; static const char data[] = @@ -62,6 +62,7 @@ static const char data[] = "Minimum2\0" "Multiply2\0" "Negate\0" + "OR\0" "PReLU\0" "Reciprocal Square Root\0" "Reshape 2D\0" diff --git a/src/enums/node-type.yaml b/src/enums/node-type.yaml index 7c825e1e28f..06a9979d297 100644 --- a/src/enums/node-type.yaml +++ b/src/enums/node-type.yaml @@ -89,6 +89,8 @@ string: "Multiply2" - name: xnn_node_type_negate string: "Negate" +- name: xnn_node_type_or + string: "OR" - name: xnn_node_type_prelu string: "PReLU" - name: xnn_node_type_reciprocal_square_root diff --git a/src/enums/operator-type.c b/src/enums/operator-type.c index a01bfeb4558..53a64f488d0 100644 --- a/src/enums/operator-type.c +++ b/src/enums/operator-type.c @@ -12,16 +12,16 @@ #include "xnnpack/operator-type.h" -static const uint16_t offset[169] = { +static const uint16_t offset[170] = { 0, 8, 22, 36, 50, 64, 78, 92, 119, 147, 175, 203, 230, 257, 289, 321, 364, 382, 400, 425, 451, 467, 483, 498, 513, 535, 558, 581, 604, 627, 650, 673, 696, 719, 742, 760, 783, 806, 830, 848, 871, 895, 919, 943, 967, 1002, 1037, 1061, 1085, 1109, 1123, 1138, 1153, 1173, 1199, 1225, 1262, 1288, 1318, 1344, 1376, 1408, 1434, 1461, 1488, 1505, 1522, 1556, 1590, 1604, 1618, 1632, 1646, 1662, 1678, 1704, 1730, 1762, 1794, 1831, 1868, 1905, 1942, 1979, 2016, 2053, 2079, 2111, 2137, 2152, 2186, 2220, 2254, 2288, 2322, 2356, 2386, 2416, 2436, 2456, 2477, 2498, 2519, 2540, 2554, - 2578, 2602, 2625, 
2648, 2666, 2684, 2699, 2714, 2732, 2750, 2769, 2788, 2807, 2826, 2845, 2862, 2879, 2895, 2911, - 2944, 2977, 3005, 3033, 3061, 3089, 3116, 3143, 3160, 3177, 3218, 3259, 3277, 3295, 3313, 3331, 3346, 3362, 3378, - 3396, 3414, 3432, 3458, 3485, 3512, 3529, 3546, 3568, 3590, 3619, 3648, 3667, 3686, 3705, 3724, 3739, 3754, 3769, - 3784, 3803, 3823, 3843, 3863, 3884, 3905 + 2578, 2602, 2625, 2648, 2666, 2684, 2699, 2714, 2732, 2750, 2769, 2788, 2807, 2826, 2845, 2862, 2879, 2892, 2908, + 2924, 2957, 2990, 3018, 3046, 3074, 3102, 3129, 3156, 3173, 3190, 3231, 3272, 3290, 3308, 3326, 3344, 3359, 3375, + 3391, 3409, 3427, 3445, 3471, 3498, 3525, 3542, 3559, 3581, 3603, 3632, 3661, 3680, 3699, 3718, 3737, 3752, 3767, + 3782, 3797, 3816, 3836, 3856, 3876, 3897, 3918 }; static const char data[] = @@ -146,6 +146,7 @@ static const char data[] = "Multiply (ND, S32)\0" "Negate (NC, F16)\0" "Negate (NC, F32)\0" + "OR (ND, S32)\0" "PReLU (NC, F16)\0" "PReLU (NC, F32)\0" "Reciprocal Square Root (NC, F16)\0" diff --git a/src/enums/operator-type.yaml b/src/enums/operator-type.yaml index 7b153359692..b4230132720 100644 --- a/src/enums/operator-type.yaml +++ b/src/enums/operator-type.yaml @@ -247,6 +247,8 @@ string: "Negate (NC, F16)" - name: xnn_operator_type_negate_nc_f32 string: "Negate (NC, F32)" +- name: xnn_operator_type_or_nd_s32 + string: "OR (ND, S32)" - name: xnn_operator_type_prelu_nc_f16 string: "PReLU (NC, F16)" - name: xnn_operator_type_prelu_nc_f32 diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index 59261d83776..4a43b38d96e 100644 --- a/src/operators/binary-elementwise-nd.c +++ b/src/operators/binary-elementwise-nd.c @@ -782,6 +782,28 @@ enum xnn_status xnn_create_multiply_nd_s32( multiply_op_out); } +enum xnn_status xnn_create_or_nd_s32( + uint32_t flags, + xnn_operator_t* or_op_out) +{ + const struct xnn_binary_elementwise_config* s32_or_config = xnn_init_s32_vor_config(); + if (s32_or_config == NULL) { + xnn_log_error("failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_or_nd_s32)); + return xnn_status_unsupported_hardware; + } + + union xnn_s32_default_params params; + + return create_binary_elementwise_nd( + flags, + ¶ms, + ¶ms, + sizeof(params), + xnn_operator_type_or_nd_s32, + &s32_or_config->linear, + or_op_out); +} enum xnn_status xnn_create_subtract_nd_f16( float output_min, @@ -1517,6 +1539,26 @@ enum xnn_status xnn_reshape_multiply_nd_s32( threadpool); } + +enum xnn_status xnn_reshape_or_nd_s32( + xnn_operator_t or_op, + size_t num_input1_dims, + const size_t* input1_shape, + size_t num_input2_dims, + const size_t* input2_shape, + pthreadpool_t threadpool) +{ + + return reshape_binary_elementwise_nd( + or_op, xnn_operator_type_or_nd_s32, + num_input1_dims, input1_shape, + num_input2_dims, input2_shape, + /*log2_element_size=*/XNN_LOG2_SIZEOF_INT32_T, + &or_op->params.s32_default, sizeof(or_op->params.s32_default), + &or_op->params.s32_default, sizeof(or_op->params.s32_default), + threadpool); +} + enum xnn_status xnn_reshape_subtract_nd_f16( xnn_operator_t subtract_op, size_t num_input1_dims, @@ -1836,6 +1878,17 @@ enum xnn_status xnn_setup_multiply_nd_s32( input1, input2, output); } +enum xnn_status xnn_setup_or_nd_s32( + xnn_operator_t or_op, + const int32_t* input1, + const int32_t* input2, + int32_t* output) +{ + return setup_binary_elementwise_nd( + or_op, xnn_operator_type_or_nd_s32, + input1, input2, output); +} + enum xnn_status xnn_setup_subtract_nd_f32( xnn_operator_t 
subtract_op, diff --git a/src/s32-vor/gen/s32-vor-avx2.c b/src/s32-vor/gen/s32-vor-avx2.c new file mode 100644 index 00000000000..4b053298274 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-avx2.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-avx2.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__avx2_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx2_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx2_u24( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + 
assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 24 * sizeof(int32_t); batch -= 24 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 24; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 24; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 24; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx2_u32( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 32; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t 
vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-avx512f.c b/src/s32-vor/gen/s32-vor-avx512f.c new file mode 100644 index 00000000000..0a49aa6a53e --- /dev/null +++ b/src/s32-vor/gen/s32-vor-avx512f.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-avx512f.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__avx512f_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx512f_u32( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 32; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, 
vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx512f_u48( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 48 * sizeof(int32_t); batch -= 48 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 48; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 48; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 48; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__avx512f_u64( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + for (; batch >= 64 * sizeof(int32_t); batch -= 64 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 64; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 64; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 64; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = 
xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-neon.c b/src/s32-vor/gen/s32-vor-neon.c new file mode 100644 index 00000000000..3e1346fd599 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-neon.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-neon.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__neon_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__neon_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output 
+= xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__neon_u12( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 12; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__neon_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + 
xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-scalar.c b/src/s32-vor/gen/s32-vor-scalar.c new file mode 100644 index 00000000000..090c3020c77 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-scalar.c @@ -0,0 +1,211 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-scalar.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__scalar_u1( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vor_ukernel__scalar_u2( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 2; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void 
xnn_s32_vor_ukernel__scalar_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 4 * sizeof(int32_t); batch -= 4 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 4; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 4; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 4; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vor_ukernel__scalar_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_4 = xnn_loadu_s32(input_a + 4 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_5 = xnn_loadu_s32(input_a + 5 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_6 = xnn_loadu_s32(input_a + 6 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_7 = xnn_loadu_s32(input_a + 7 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_4 = (xnn_loadu_s32(input_b + 4 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_5 = (xnn_loadu_s32(input_b + 5 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_6 = (xnn_loadu_s32(input_b + 6 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_7 = (xnn_loadu_s32(input_b + 7 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); 
+ xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + xnn_simd_s32_t vy_4 = xnn_or_s32(vin1_4, vin2_4); + xnn_simd_s32_t vy_5 = xnn_or_s32(vin1_5, vin2_5); + xnn_simd_s32_t vy_6 = xnn_or_s32(vin1_6, vin2_6); + xnn_simd_s32_t vy_7 = xnn_or_s32(vin1_7, vin2_7); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + xnn_storeu_s32(output + 4 * xnn_simd_size_s32, vy_4); + xnn_storeu_s32(output + 5 * xnn_simd_size_s32, vy_5); + xnn_storeu_s32(output + 6 * xnn_simd_size_s32, vy_6); + xnn_storeu_s32(output + 7 * xnn_simd_size_s32, vy_7); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} diff --git a/src/s32-vor/gen/s32-vor-sse41.c b/src/s32-vor/gen/s32-vor-sse41.c new file mode 100644 index 00000000000..937b332f01a --- /dev/null +++ b/src/s32-vor/gen/s32-vor-sse41.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-sse41.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__sse41_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__sse41_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + 
xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__sse41_u12( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 12; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__sse41_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = 
xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vor-wasmsimd.c b/src/s32-vor/gen/s32-vor-wasmsimd.c new file mode 100644 index 00000000000..87b7fab3cb7 --- /dev/null +++ b/src/s32-vor/gen/s32-vor-wasmsimd.c @@ -0,0 +1,227 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vor.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
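+//
+// Elementwise bitwise OR of two int32 inputs. As in the other per-architecture
+// copies of this template, the wider variants run an unrolled multi-register
+// loop first, then a one-register-per-iteration loop, and finish with a masked
+// xnn_load_tail_s32/xnn_store_tail_s32 pass for the last few elements. The
+// `batch` argument is a byte count and must be a non-zero multiple of
+// sizeof(int32_t).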
+ +#include +#include +#include + +#include "xnnpack/simd/s32-wasmsimd.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vor_ukernel__wasmsimd_u4( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__wasmsimd_u8( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + input_a += 8; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + input_b += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__wasmsimd_u12( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + input_a += 12; + + 
xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + input_b += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vor_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + xnn_simd_s32_t vin1_0 = xnn_loadu_s32(input_a); + xnn_simd_s32_t vin1_1 = xnn_loadu_s32(input_a + 1 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_2 = xnn_loadu_s32(input_a + 2 * xnn_simd_size_s32); + xnn_simd_s32_t vin1_3 = xnn_loadu_s32(input_a + 3 * xnn_simd_size_s32); + input_a += 16; + + xnn_simd_s32_t vin2_0 = xnn_loadu_s32(input_b); + xnn_simd_s32_t vin2_1 = (xnn_loadu_s32(input_b + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_2 = (xnn_loadu_s32(input_b + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin2_3 = (xnn_loadu_s32(input_b + 3 * xnn_simd_size_s32)); + input_b += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2_0); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2_1); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2_2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2_3); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-avx2.c b/src/s32-vor/gen/s32-vorc-avx2.c new file mode 100644 index 00000000000..6919ee0ee1f --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-avx2.c 
@@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "xnnpack/simd/s32-avx2.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__avx2_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u24( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 24 * sizeof(int32_t); batch -= 24 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 24; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = 
xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 24; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx2_u32( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 8); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-avx512f.c b/src/s32-vor/gen/s32-vorc-avx512f.c new file mode 100644 index 00000000000..ecf4a2863ed --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-avx512f.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
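+//
+// "vorc" is the OR-with-constant variant: input2 points at a single int32 that
+// is broadcast once with xnn_set1_s32, and every element of input1 is ORed
+// against that splatted value.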
+ +#include +#include +#include + +#include "xnnpack/simd/s32-avx512f.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__avx512f_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u32( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 32 * sizeof(int32_t); batch -= 32 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 32; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 32; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u48( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 48 * sizeof(int32_t); batch -= 48 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 48; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 48; + } + for (; batch >= xnn_simd_bytes_s32; 
batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__avx512f_u64( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 16); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 64 * sizeof(int32_t); batch -= 64 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 64; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 64; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-neon.c b/src/s32-vor/gen/s32-vorc-neon.c new file mode 100644 index 00000000000..6729f581d9f --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-neon.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
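+//
+// The u4/u8/u12/u16 suffixes give the unroll factor in int32 elements; with
+// 4-lane NEON vectors these correspond to 1, 2, 3 and 4 registers per
+// iteration of the unrolled loop. Batches that are not a multiple of the
+// unroll factor fall through to the single-register loop and the masked tail.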
+ +#include +#include +#include + +#include "xnnpack/simd/s32-neon.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__neon_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u12( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= 
xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__neon_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-scalar.c b/src/s32-vor/gen/s32-vorc-scalar.c new file mode 100644 index 00000000000..e60eadd40e3 --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-scalar.c @@ -0,0 +1,190 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
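+//
+// In the scalar build xnn_simd_size_s32 is 1, so every element is handled by
+// the "full register" loops and no masked tail block is needed.
+//
+// Illustrative call (hypothetical buffers and values, not part of the
+// generated code):
+//   int32_t a[5] = {1, 2, 4, 8, 16};
+//   int32_t b = 0x100;
+//   int32_t y[5];
+//   union xnn_s32_default_params params;  // never read by these kernels
+//   xnn_s32_vorc_ukernel__scalar_u1(5 * sizeof(int32_t), a, &b, y, &params);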
+ +#include +#include +#include + +#include "xnnpack/simd/s32-scalar.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__scalar_u1( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u2( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 2 * sizeof(int32_t); batch -= 2 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 2; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 2; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 4 * sizeof(int32_t); batch -= 4 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 4; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 4; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} + +void xnn_s32_vorc_ukernel__scalar_u8( + size_t batch, + const int32_t* input1, + const 
int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 1); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_4 = (xnn_loadu_s32(input1 + 4 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_5 = (xnn_loadu_s32(input1 + 5 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_6 = (xnn_loadu_s32(input1 + 6 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_7 = (xnn_loadu_s32(input1 + 7 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + xnn_simd_s32_t vy_4 = xnn_or_s32(vin1_4, vin2); + xnn_simd_s32_t vy_5 = xnn_or_s32(vin1_5, vin2); + xnn_simd_s32_t vy_6 = xnn_or_s32(vin1_6, vin2); + xnn_simd_s32_t vy_7 = xnn_or_s32(vin1_7, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + xnn_storeu_s32(output + 4 * xnn_simd_size_s32, vy_4); + xnn_storeu_s32(output + 5 * xnn_simd_size_s32, vy_5); + xnn_storeu_s32(output + 6 * xnn_simd_size_s32, vy_6); + xnn_storeu_s32(output + 7 * xnn_simd_size_s32, vy_7); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } +} diff --git a/src/s32-vor/gen/s32-vorc-sse41.c b/src/s32-vor/gen/s32-vorc-sse41.c new file mode 100644 index 00000000000..38013a5dbdb --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-sse41.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
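+//
+// The tail helpers take the remaining element count, computed from the byte
+// count as `batch >> XNN_LOG2_SIZEOF_INT32_T` (i.e. batch / sizeof(int32_t));
+// how the partial vector is actually loaded and stored is defined by
+// xnn_load_tail_s32/xnn_store_tail_s32 in the per-architecture s32 SIMD
+// header, not in this file.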
+ +#include +#include +#include + +#include "xnnpack/simd/s32-sse41.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__sse41_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u12( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch -= 
xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__sse41_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/gen/s32-vorc-wasmsimd.c b/src/s32-vor/gen/s32-vorc-wasmsimd.c new file mode 100644 index 00000000000..5707f68328d --- /dev/null +++ b/src/s32-vor/gen/s32-vorc-wasmsimd.c @@ -0,0 +1,203 @@ +// Auto-generated file. Do not edit! +// Template: src/s32-vor/s32-vorc.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
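The WasmSIMD file below is generated from the same s32-vorc template as the SSE4.1 file above; only the 4-lane load/store/OR primitives differ. In every unrolled variant the batch is consumed in three stages: full unrolled iterations, single-vector iterations, and a masked tail handled with xnn_load_tail_s32/xnn_store_tail_s32. A small sketch of that split (an illustrative helper, not part of the change):

#include <stddef.h>
#include <stdint.h>

// How an unrolled kernel such as ..._u16 with 4-lane vectors partitions a batch:
// `full_iters` iterations of the unrolled loop (unroll_elems elements each), then
// `single_iters` single-vector iterations (simd_elems each), then a partial tail of
// fewer than simd_elems elements.
static void split_batch(size_t batch_bytes, size_t unroll_elems, size_t simd_elems,
                        size_t* full_iters, size_t* single_iters, size_t* tail_elems) {
  size_t elems = batch_bytes / sizeof(int32_t);
  *full_iters = elems / unroll_elems;
  elems -= *full_iters * unroll_elems;
  *single_iters = elems / simd_elems;
  *tail_elems = elems - *single_iters * simd_elems;  // load_tail/store_tail path
}

For example, a batch of 23 int32 values (92 bytes) processed by the 16-element unroll with 4-lane vectors splits into one unrolled iteration, one single-vector iteration, and a 3-element tail.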
+ +#include +#include +#include + +#include "xnnpack/simd/s32-wasmsimd.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + + +void xnn_s32_vorc_ukernel__wasmsimd_u4( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u8( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 8 * sizeof(int32_t); batch -= 8 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + input1 += 8; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + output += 8; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u12( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 12 * sizeof(int32_t); batch -= 12 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + input1 += 12; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + output += 12; + } + for (; batch >= xnn_simd_bytes_s32; batch 
-= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} + +void xnn_s32_vorc_ukernel__wasmsimd_u16( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == 4); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + for (; batch >= 16 * sizeof(int32_t); batch -= 16 * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_0 = (xnn_loadu_s32(input1)); + xnn_simd_s32_t vin1_1 = (xnn_loadu_s32(input1 + 1 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_2 = (xnn_loadu_s32(input1 + 2 * xnn_simd_size_s32)); + xnn_simd_s32_t vin1_3 = (xnn_loadu_s32(input1 + 3 * xnn_simd_size_s32)); + input1 += 16; + + xnn_simd_s32_t vy_0 = xnn_or_s32(vin1_0, vin2); + xnn_simd_s32_t vy_1 = xnn_or_s32(vin1_1, vin2); + xnn_simd_s32_t vy_2 = xnn_or_s32(vin1_2, vin2); + xnn_simd_s32_t vy_3 = xnn_or_s32(vin1_3, vin2); + + xnn_storeu_s32(output, vy_0); + xnn_storeu_s32(output + 1 * xnn_simd_size_s32, vy_1); + xnn_storeu_s32(output + 2 * xnn_simd_size_s32, vy_2); + xnn_storeu_s32(output + 3 * xnn_simd_size_s32, vy_3); + output += 16; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } +} diff --git a/src/s32-vor/s32-vor.c.in b/src/s32-vor/s32-vor.c.in new file mode 100644 index 00000000000..53642151ce2 --- /dev/null +++ b/src/s32-vor/s32-vor.c.in @@ -0,0 +1,80 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
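The template below (s32-vor.c.in) generates the two-tensor variant, where both operands are full arrays rather than one broadcast constant. An illustrative scalar reference of what the generated kernels compute:

#include <stddef.h>
#include <stdint.h>

// Reference semantics of the two-input s32 "vor" kernels generated from the template
// below: output[i] = input_a[i] | input_b[i], with `batch` again given in bytes.
static void s32_vor_reference(size_t batch, const int32_t* input_a,
                              const int32_t* input_b, int32_t* output) {
  for (size_t i = 0; i < batch / sizeof(int32_t); i++) {
    output[i] = input_a[i] | input_b[i];
  }
}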
+ +$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" +$BATCH_TILES = tuple(int(bt) for bt in BATCH_TILES.split(",")) +$SIMD_SIZE = BATCH_TILES[0] +#include +#include +#include + +#include "xnnpack/simd/s32-${ARCH}.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + +$for BATCH_TILE in BATCH_TILES: + $assert BATCH_TILE % SIMD_SIZE == 0 + $assert BATCH_TILE >= SIMD_SIZE + $SIMD_TILE = BATCH_TILE // SIMD_SIZE + + void xnn_s32_vor_ukernel__${ARCH}_u${BATCH_TILE}( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) + { + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input_b != NULL); + assert(input_a != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == ${SIMD_SIZE}); + + $if SIMD_TILE > 1: + for (; batch >= ${BATCH_TILE} * sizeof(int32_t); batch -= ${BATCH_TILE} * sizeof(int32_t)) { + xnn_simd_s32_t vin1_${ABC[0]} = xnn_loadu_s32(input_a); + $for N in range(1, SIMD_TILE): + xnn_simd_s32_t vin1_${ABC[N]} = xnn_loadu_s32(input_a + ${N} * xnn_simd_size_s32); + input_a += ${BATCH_TILE}; + + xnn_simd_s32_t vin2_${ABC[0]} = xnn_loadu_s32(input_b); + $for N in range(1, SIMD_TILE): + xnn_simd_s32_t vin2_${ABC[N]} = (xnn_loadu_s32(input_b + ${N} * xnn_simd_size_s32)); + input_b += ${BATCH_TILE}; + + $for N in range(0, SIMD_TILE): + xnn_simd_s32_t vy_${ABC[N]} = xnn_or_s32(vin1_${ABC[N]}, vin2_${ABC[N]}); + + xnn_storeu_s32(output, vy_${ABC[0]}); + $for N in range(1, SIMD_TILE): + xnn_storeu_s32(output + ${N} * xnn_simd_size_s32, vy_${ABC[N]}); + output += ${BATCH_TILE}; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input_a); + input_a += xnn_simd_size_s32; + + xnn_simd_s32_t vin2 = xnn_loadu_s32(input_b); + input_b += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + $if SIMD_SIZE > 1: + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = xnn_load_tail_s32(input_a, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vin2 = xnn_load_tail_s32(input_b, batch >> XNN_LOG2_SIZEOF_INT32_T); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } + } diff --git a/src/s32-vor/s32-vorc.c.in b/src/s32-vor/s32-vorc.c.in new file mode 100644 index 00000000000..fb433b92081 --- /dev/null +++ b/src/s32-vor/s32-vorc.c.in @@ -0,0 +1,73 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
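Like s32-vor.c.in above, the s32-vorc.c.in template that follows is expanded by tools/xngen once per architecture: BATCH_TILES is a comma-separated list of unroll widths (judging from the generated files, "4,8,12,16" for the 4-lane SSE4.1/NEON/WasmSIMD builds and "1,2,4,8" for scalar), SIMD_SIZE is its first entry, and SIMD_TILE = BATCH_TILE // SIMD_SIZE sets how many vectors the main loop handles per iteration. The $if SIMD_TILE > 1 and $if SIMD_SIZE > 1 guards omit the unrolled loop and the masked tail where they would be empty, so the smallest scalar variant should contain only the element-by-element loop.

// Expected expansion for ARCH=wasmsimd with BATCH_TILES="4,8,12,16" (matches the
// declarations added to src/xnnpack/vbinary.h later in this change):
//   xnn_s32_vorc_ukernel__wasmsimd_u4, _u8, _u12, _u16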
+ +$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" +$BATCH_TILES = tuple(int(bt) for bt in BATCH_TILES.split(",")) +$SIMD_SIZE = BATCH_TILES[0] +#include +#include +#include + +#include "xnnpack/simd/s32-${ARCH}.h" + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" + +$for BATCH_TILE in BATCH_TILES: + $assert BATCH_TILE % SIMD_SIZE == 0 + $assert BATCH_TILE >= SIMD_SIZE + $SIMD_TILE = BATCH_TILE // SIMD_SIZE + + void xnn_s32_vorc_ukernel__${ARCH}_u${BATCH_TILE}( + size_t batch, + const int32_t* input1, + const int32_t* input2, + int32_t* output, + const union xnn_s32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) + { + assert(batch != 0); + assert(batch % sizeof(int32_t) == 0); + assert(input1 != NULL); + assert(input2 != NULL); + assert(output != NULL); + assert(xnn_simd_size_s32 == ${SIMD_SIZE}); + + xnn_simd_s32_t vin2 = xnn_set1_s32(*input2); + + $if SIMD_TILE > 1: + for (; batch >= ${BATCH_TILE} * sizeof(int32_t); batch -= ${BATCH_TILE} * sizeof(int32_t)) { + + xnn_simd_s32_t vin1_${ABC[0]} = (xnn_loadu_s32(input1)); + $for N in range(1, SIMD_TILE): + xnn_simd_s32_t vin1_${ABC[N]} = (xnn_loadu_s32(input1 + ${N} * xnn_simd_size_s32)); + input1 += ${BATCH_TILE}; + + $for N in range(0, SIMD_TILE): + xnn_simd_s32_t vy_${ABC[N]} = xnn_or_s32(vin1_${ABC[N]}, vin2); + + xnn_storeu_s32(output, vy_${ABC[0]}); + $for N in range(1, SIMD_TILE): + xnn_storeu_s32(output + ${N} * xnn_simd_size_s32, vy_${ABC[N]}); + output += ${BATCH_TILE}; + } + for (; batch >= xnn_simd_bytes_s32; batch -= xnn_simd_bytes_s32) { + xnn_simd_s32_t vin1 = xnn_loadu_s32(input1); + input1 += xnn_simd_size_s32; + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_storeu_s32(output, vy); + output += xnn_simd_size_s32; + } + $if SIMD_SIZE > 1: + if XNN_UNLIKELY(batch != 0) { + xnn_simd_s32_t vin1 = (xnn_load_tail_s32(input1, batch >> XNN_LOG2_SIZEOF_INT32_T)); + + xnn_simd_s32_t vy = xnn_or_s32(vin1, vin2); + + xnn_store_tail_s32(output, vy, batch >> XNN_LOG2_SIZEOF_INT32_T); + } + } diff --git a/src/subgraph/or.c b/src/subgraph/or.c new file mode 100644 index 00000000000..d16e3d8667b --- /dev/null +++ b/src/subgraph/or.c @@ -0,0 +1,253 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
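src/subgraph/or.c below wires the new operator into the subgraph API as xnn_node_type_or, with create/reshape/setup callbacks, and exposes xnn_define_or to users. A hedged usage sketch, assuming the subgraph and its int32 tensor values were already defined elsewhere (only xnn_define_or itself is introduced by this change, and the helper name is hypothetical):

#include <stdint.h>
#include "xnnpack.h"

// Illustrative only: `subgraph`, `a_id`, `b_id`, and `out_id` are assumed to come from
// xnn_create_subgraph / xnn_define_tensor_value calls with xnn_datatype_int32.
static enum xnn_status define_or_node(xnn_subgraph_t subgraph, uint32_t a_id,
                                      uint32_t b_id, uint32_t out_id) {
  // The two inputs follow the usual binary-elementwise broadcasting rules.
  return xnn_define_or(subgraph, a_id, b_id, out_id, /*flags=*/0);
}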
+ +#include +#include +#include +#include +#include + +#include "xnnpack.h" +#include "xnnpack/common.h" +#include "xnnpack/log.h" +#include "xnnpack/node-type.h" +#include "xnnpack/operator-type.h" +#include "xnnpack/operator.h" +#include "xnnpack/reshape-helpers.h" +#include "xnnpack/subgraph-validation.h" +#include "xnnpack/subgraph.h" +#include "pthreadpool.h" + +static enum xnn_status create_or_operator( + const struct xnn_node* node, + const struct xnn_value* values, + size_t num_values, + struct xnn_operator_data* opdata, + struct xnn_code_cache* code_cache, + xnn_weights_cache_t weights_cache) +{ + assert(node->num_inputs == 2); + assert(node->num_outputs == 1); + + enum xnn_status status; + switch (node->compute_type) { + case xnn_compute_type_s32: + status = xnn_create_or_nd_s32( + node->flags, + &opdata->operator_objects[0]); + break; + default: + XNN_UNREACHABLE; + } + return status; +} + +static enum xnn_status reshape_or_operator( + struct xnn_operator_data* opdata, + struct xnn_value* values, + size_t num_values, + pthreadpool_t threadpool) +{ + const uint32_t input1_id = opdata->inputs[0]; + assert(input1_id < num_values); + const uint32_t input2_id = opdata->inputs[1]; + assert(input2_id < num_values); + const uint32_t output_id = opdata->outputs[0]; + assert(output_id < num_values); + + opdata->shape1.num_dims = values[input1_id].shape.num_dims; + opdata->shape2.num_dims = values[input2_id].shape.num_dims; + if (values[output_id].layout == xnn_layout_type_nchw) { + assert(values[input1_id].layout == xnn_layout_type_nchw); + assert(values[input2_id].layout == xnn_layout_type_nchw); + opdata->shape1.dim[0] = values[input1_id].shape.dim[0]; + opdata->shape1.dim[1] = values[input1_id].shape.dim[values[input1_id].shape.num_dims - 1]; + if (values[input1_id].shape.num_dims > 2) { + memcpy(&opdata->shape1.dim[2], &values[input1_id].shape.dim[1], (values[input1_id].shape.num_dims - 2) * sizeof(size_t)); + } + opdata->shape2.dim[0] = values[input2_id].shape.dim[0]; + opdata->shape2.dim[1] = values[input2_id].shape.dim[values[input2_id].shape.num_dims - 1]; + if (values[input1_id].shape.num_dims > 2) { + memcpy(&opdata->shape2.dim[2], &values[input2_id].shape.dim[1], (values[input2_id].shape.num_dims - 2) * sizeof(size_t)); + } + } else { + assert(values[output_id].layout == xnn_layout_type_nhwc); + assert(values[input1_id].layout == xnn_layout_type_nhwc); + assert(values[input2_id].layout == xnn_layout_type_nhwc); + memcpy(opdata->shape1.dim, values[input1_id].shape.dim, values[input1_id].shape.num_dims * sizeof(size_t)); + memcpy(opdata->shape2.dim, values[input2_id].shape.dim, values[input2_id].shape.num_dims * sizeof(size_t)); + } + + // Handle scalars. Although the output shape is dimensionless, the reshape + // function must be passed a valid shape to prevent skipping the op. 
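  // (For example, two 0-dimensional int32 inputs are treated as 1-element vectors of
  // shape {1}, so xnn_reshape_or_nd_s32 still receives valid, non-empty shapes.)
  // Worked example of the NCHW branch above: a stored shape of {2, 3, 4, 5} becomes the
  // operator shape {2, 5, 3, 4}: the batch dimension stays first, the last (channel)
  // dimension moves to position 1, and the remaining inner dimensions are copied after it.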
+ if (opdata->shape1.num_dims == 0) { + opdata->shape1.num_dims = 1; + opdata->shape1.dim[0] = 1; + } + if (opdata->shape2.num_dims == 0) { + opdata->shape2.num_dims = 1; + opdata->shape2.dim[0] = 1; + } + const size_t old_workspace_size = opdata->workspace_size; + enum xnn_status status = xnn_status_invalid_state; + switch (opdata->operator_objects[0]->type) { + case xnn_operator_type_or_nd_s32: + status = xnn_reshape_or_nd_s32( + opdata->operator_objects[0], + opdata->shape1.num_dims, + opdata->shape1.dim, + opdata->shape2.num_dims, + opdata->shape2.dim, + threadpool); + break; + default: + XNN_UNREACHABLE; + } + if (status != xnn_status_success) { + return status; + } + return resize_binary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); +} + +static enum xnn_status setup_or_operator( + const struct xnn_operator_data* opdata, + const struct xnn_value* values, + size_t num_values, + pthreadpool_t threadpool) +{ + const uint32_t input1_id = opdata->inputs[0]; + assert(input1_id != XNN_INVALID_VALUE_ID); + assert(input1_id < num_values); + + const uint32_t input2_id = opdata->inputs[1]; + assert(input2_id != XNN_INVALID_VALUE_ID); + assert(input2_id < num_values); + + const uint32_t output_id = opdata->outputs[0]; + assert(output_id != XNN_INVALID_VALUE_ID); + assert(output_id < num_values); + + const struct xnn_value* input1_value = values + input1_id; + const void* input1_data = input1_value->data; + assert(input1_data != NULL); + + const struct xnn_value* input2_value = values + input2_id; + const void* input2_data = input2_value->data; + assert(input2_data != NULL); + + const struct xnn_value* output_value = values + output_id; + void* output_data = output_value->data; + assert(output_data != NULL); + + switch (opdata->operator_objects[0]->type) { + case xnn_operator_type_or_nd_s32: + return xnn_setup_or_nd_s32( + opdata->operator_objects[0], + input1_data, input2_data, output_data); + default: + XNN_UNREACHABLE; + } +} + +enum xnn_status xnn_define_or( + xnn_subgraph_t subgraph, + uint32_t input1_id, + uint32_t input2_id, + uint32_t output_id, + uint32_t flags) +{ + enum xnn_status status; + if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_or)) != xnn_status_success) { + return status; + } + + if ((status = xnn_subgraph_check_nth_input_node_id(xnn_node_type_or, input1_id, subgraph->num_values, 1)) != + xnn_status_success) { + return status; + } + + const struct xnn_value* input1_value = &subgraph->values[input1_id]; + status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_or, input1_id, input1_value, 1); + if (status != xnn_status_success) { + return status; + } + + switch (input1_value->datatype) { + case xnn_datatype_int32: + break; + default: + xnn_log_error( + "failed to define %s operator with the first input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_or), input1_id, + xnn_datatype_to_string(input1_value->datatype), input1_value->datatype); + return xnn_status_invalid_parameter; + } + + if ((status = xnn_subgraph_check_nth_input_node_id( + xnn_node_type_or, input2_id, subgraph->num_values, 2)) != xnn_status_success) { + return status; + } + + const struct xnn_value* input2_value = &subgraph->values[input2_id]; + status = xnn_subgraph_check_nth_input_type_dense(xnn_node_type_or, input2_id, input2_value, 2); + if (status != xnn_status_success) { + return status; + } + + switch (input2_value->datatype) { + case xnn_datatype_int32: + break; + default: + xnn_log_error( + 
"failed to define %s operator with the second input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_or), input2_id, + xnn_datatype_to_string(input2_value->datatype), input2_value->datatype); + return xnn_status_invalid_parameter; + } + + status = xnn_subgraph_check_output_node_id(xnn_node_type_or, output_id, subgraph->num_values); + if (status != xnn_status_success) { + return status; + } + + const struct xnn_value* output_value = &subgraph->values[output_id]; + status = xnn_subgraph_check_output_type_dense(xnn_node_type_or, output_id, output_value); + if (status != xnn_status_success) { + return status; + } + + enum xnn_compute_type compute_type = xnn_compute_type_invalid; + switch (output_value->datatype) { + case xnn_datatype_int32: + compute_type = xnn_compute_type_s32; + break; + default: + xnn_log_error( + "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", + xnn_node_type_to_string(xnn_node_type_or), output_id, + xnn_datatype_to_string(output_value->datatype), output_value->datatype); + return xnn_status_invalid_parameter; + } + + struct xnn_node* node = xnn_subgraph_new_node(subgraph); + if (node == NULL) { + return xnn_status_out_of_memory; + } + + node->type = xnn_node_type_or; + node->compute_type = compute_type; + node->num_inputs = 2; + node->inputs[0] = input1_id; + node->inputs[1] = input2_id; + node->num_outputs = 1; + node->outputs[0] = output_id; + node->flags = flags; + + node->create = create_or_operator; + node->reshape = reshape_or_operator; + node->setup = setup_or_operator; + + return xnn_status_success; +} diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 3f1d2b599de..382e344b3e7 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -35,6 +35,7 @@ XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vsqrdiff_c XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vadd_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vcopysign_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_s32_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_s32_vor_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vdiv_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmax_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmin_config(); diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index d11ab8e6e60..83e0879185c 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1980,6 +1980,15 @@ typedef void (*xnn_f32_vneg_ukernel_fn)( float* output, const union xnn_f32_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +// VOR: Vector OR elementwise + +typedef void (*xnn_s32_vor_ukernel_fn)( + size_t batch, + const int32_t* input_a, + const int32_t* input_b, + int32_t* output, + const union xnn_s32_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + // VRELU: Vector REctified Linear Unit elementwise typedef void (*xnn_f32_vrelu_ukernel_fn)( diff --git a/src/xnnpack/node-type.h b/src/xnnpack/node-type.h index a0100109e8e..6401efa09f3 100644 --- a/src/xnnpack/node-type.h +++ b/src/xnnpack/node-type.h @@ -59,6 +59,7 @@ enum xnn_node_type { xnn_node_type_minimum2, xnn_node_type_multiply2, xnn_node_type_negate, + xnn_node_type_or, xnn_node_type_prelu, xnn_node_type_reciprocal_square_root, xnn_node_type_reshape_2d, diff --git 
a/src/xnnpack/operator-type.h b/src/xnnpack/operator-type.h index c0ecfdb3200..2c2c0a32adf 100644 --- a/src/xnnpack/operator-type.h +++ b/src/xnnpack/operator-type.h @@ -138,6 +138,7 @@ enum xnn_operator_type { xnn_operator_type_multiply_nd_s32, xnn_operator_type_negate_nc_f16, xnn_operator_type_negate_nc_f32, + xnn_operator_type_or_nd_s32, xnn_operator_type_prelu_nc_f16, xnn_operator_type_prelu_nc_f32, xnn_operator_type_reciprocal_square_root_nc_f16, diff --git a/src/xnnpack/simd/s32-avx2.h b/src/xnnpack/simd/s32-avx2.h index 7a4fdd7ff2f..24d8567ac56 100644 --- a/src/xnnpack/simd/s32-avx2.h +++ b/src/xnnpack/simd/s32-avx2.h @@ -43,6 +43,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return _mm256_min_epi32(a, b); } +// Bitwise Operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return _mm256_or_si256(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { diff --git a/src/xnnpack/simd/s32-avx512f.h b/src/xnnpack/simd/s32-avx512f.h index ec89f56cc19..5bcbaa0bf24 100644 --- a/src/xnnpack/simd/s32-avx512f.h +++ b/src/xnnpack/simd/s32-avx512f.h @@ -42,22 +42,29 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return _mm512_min_epi32(a, b); } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return _mm512_or_epi32(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { - return _mm512_loadu_epi32(ptr); + return _mm512_loadu_si512(ptr); } static XNN_INLINE xnn_simd_s32_t xnn_load_s32(const int32_t* ptr) { - return _mm512_load_epi32(ptr); + return _mm512_load_si512(ptr); } static XNN_INLINE void xnn_storeu_s32(int32_t* ptr, xnn_simd_s32_t v) { - _mm512_storeu_epi32(ptr, v); + _mm512_storeu_si512(ptr, v); } static XNN_INLINE void xnn_store_s32(float* ptr, xnn_simd_s32_t v) { - _mm512_store_epi32(ptr, v); + _mm512_store_si512(ptr, v); } static XNN_INLINE xnn_simd_s32_t xnn_set1_s32(int32_t v) { diff --git a/src/xnnpack/simd/s32-neon.h b/src/xnnpack/simd/s32-neon.h index 62aa2b70f50..b4354c8b634 100644 --- a/src/xnnpack/simd/s32-neon.h +++ b/src/xnnpack/simd/s32-neon.h @@ -38,6 +38,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return vminq_s32(a, b); } +// Bitwsie operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return vorrq_s32(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { return vld1q_s32(ptr); diff --git a/src/xnnpack/simd/s32-scalar.h b/src/xnnpack/simd/s32-scalar.h index 55c1fc53d12..aa625135796 100644 --- a/src/xnnpack/simd/s32-scalar.h +++ b/src/xnnpack/simd/s32-scalar.h @@ -39,6 +39,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return (a < b) ? 
a : b; } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return (a | b); +} + static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t *ptr) { return *ptr; } static XNN_INLINE xnn_simd_s32_t xnn_load_s32(const int32_t *ptr) { return *ptr; } diff --git a/src/xnnpack/simd/s32-sse41.h b/src/xnnpack/simd/s32-sse41.h index c453f097506..ea03b638783 100644 --- a/src/xnnpack/simd/s32-sse41.h +++ b/src/xnnpack/simd/s32-sse41.h @@ -41,6 +41,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return _mm_min_epi32(a, b); } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return _mm_or_si128(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { diff --git a/src/xnnpack/simd/s32-wasmsimd.h b/src/xnnpack/simd/s32-wasmsimd.h index 96e2c836252..141d952487a 100644 --- a/src/xnnpack/simd/s32-wasmsimd.h +++ b/src/xnnpack/simd/s32-wasmsimd.h @@ -41,6 +41,13 @@ static XNN_INLINE xnn_simd_s32_t xnn_min_s32(xnn_simd_s32_t a, return wasm_i32x4_min(a, b); } +// Bitwise operations + +static XNN_INLINE xnn_simd_s32_t xnn_or_s32(xnn_simd_s32_t a, + xnn_simd_s32_t b) { + return wasm_v128_or(a, b); +} + // Load/store operations. static XNN_INLINE xnn_simd_s32_t xnn_loadu_s32(const int32_t* ptr) { diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h index 8b11dd26dc7..b8b0ce4d43f 100644 --- a/src/xnnpack/vbinary.h +++ b/src/xnnpack/vbinary.h @@ -1339,6 +1339,56 @@ DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vmulc_ukernel__wasmsimd_u8) DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vmulc_ukernel__wasmsimd_u12) DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vmulc_ukernel__wasmsimd_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u24) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx2_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u48) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__avx512f_u64) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__neon_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u1) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u2) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__scalar_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__sse41_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vor_ukernel__wasmsimd_u16) + +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u8) 
+DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u24) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx2_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u32) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u48) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__avx512f_u64) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__neon_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u1) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u2) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__scalar_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__sse41_u16) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u4) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u8) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u12) +DECLARE_S32_VBINOP_UKERNEL_FUNCTION(xnn_s32_vorc_ukernel__wasmsimd_u16) + #ifdef __cplusplus } // extern "C" #endif diff --git a/test/BUILD.bazel b/test/BUILD.bazel index c7d5d3afab6..f8e6a59b935 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -291,6 +291,7 @@ xnnpack_cc_library( "qu8_vmul_minmax_fp32", "qu8_vmul_minmax_rndnu", "s32_vmul", + "s32_vor", ]] [xnnpack_unit_test( @@ -339,6 +340,7 @@ xnnpack_cc_library( "qu8_vmulc_minmax_fp32", "qu8_vmulc_minmax_rndnu", "s32_vmulc", + "s32_vorc", ]] [xnnpack_unit_test( @@ -1584,6 +1586,7 @@ xnnpack_binary( "squared_difference_nd_eager", "subtract_nd", "subtract_nd_eager", + "or_nd" ]] xnnpack_unit_test( @@ -2092,6 +2095,7 @@ xnnpack_unit_test( "maximum2", "minimum2", "multiply2", + "or", "squared_difference", "subtract2", ]] diff --git a/test/binary-elementwise-operator-tester.cc b/test/binary-elementwise-operator-tester.cc index b19a07c781b..a2eb0a39e02 100644 --- a/test/binary-elementwise-operator-tester.cc +++ b/test/binary-elementwise-operator-tester.cc @@ -809,6 +809,10 @@ void BinaryElementwiseOperatorTester::TestS32() const { ASSERT_EQ(xnn_status_success, xnn_create_multiply_nd_s32(0, &binary_elementwise_op)); break; + case OperationType::OR: + ASSERT_EQ(xnn_status_success, + xnn_create_or_nd_s32(0, &binary_elementwise_op)); + break; default: FAIL() << "Unsupported operation type"; } @@ -830,6 +834,17 @@ void BinaryElementwiseOperatorTester::TestS32() const { binary_elementwise_op, input1.data(), input2.data(), output.data())); break; + case OperationType::OR: + ASSERT_EQ( + xnn_status_success, + xnn_reshape_or_nd_s32( + binary_elementwise_op, num_input1_dims(), input1_shape().data(), + num_input2_dims(), input2_shape().data(), + /*threadpool=*/nullptr)); + ASSERT_EQ(xnn_status_success, xnn_setup_or_nd_s32( + binary_elementwise_op, input1.data(), + input2.data(), output.data())); + break; default: FAIL() << "Unsupported operation type"; } diff --git a/test/binary-elementwise-operator-tester.h 
b/test/binary-elementwise-operator-tester.h index 6b46730e79b..df93710f4a3 100644 --- a/test/binary-elementwise-operator-tester.h +++ b/test/binary-elementwise-operator-tester.h @@ -30,6 +30,7 @@ class BinaryElementwiseOperatorTester { Maximum, Minimum, Multiply, + OR, Subtract, SquaredDifference, }; @@ -194,6 +195,8 @@ class BinaryElementwiseOperatorTester { return std::min(a, b); case OperationType::Multiply: return a * b; + case OperationType::OR: + return a | b; case OperationType::Subtract: return a - b; case OperationType::SquaredDifference: diff --git a/test/or-nd.cc b/test/or-nd.cc new file mode 100644 index 00000000000..bc8be75b35e --- /dev/null +++ b/test/or-nd.cc @@ -0,0 +1,1157 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "binary-elementwise-operator-tester.h" +#include + +constexpr size_t kDim1 = 2; +constexpr size_t kDim2 = 3; +constexpr size_t kDim3 = 4; +constexpr size_t kDim4 = 5; +constexpr size_t kDim5 = 6; +constexpr size_t kDim6 = 7; + + +TEST(OR_ND_S32, or_0d_x_0d) { + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .TestS32(); +} + +TEST(OR_ND_S32, or_1d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_0d_x_1d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_2d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_2d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_3d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 
1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_3d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 
1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_4d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_4d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_5d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 
1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 
1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_5d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_0d_x_6d) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_1d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 1); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_2d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 2); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_3d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 3); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_4d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 4); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_5d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 5); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_0d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 
1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .TestS32(); + } +} + +TEST(OR_ND_S32, or_6d_x_1d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 1); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_2d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 2); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 
1 : kDim2; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_3d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 3); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_4d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 4); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 
1 : kDim4; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_5d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 5); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} + +TEST(OR_ND_S32, or_6d_x_6d) { + for (uint32_t bm1 = 0; bm1 < (uint32_t(1) << 6); bm1++) { + for (uint32_t bm2 = 0; bm2 < (uint32_t(1) << 6); bm2++) { + const bool input1_broadcast_dim1 = (bm1 & (uint32_t(1) << 0)) != 0; + const bool input1_broadcast_dim2 = (bm1 & (uint32_t(1) << 1)) != 0; + const bool input1_broadcast_dim3 = (bm1 & (uint32_t(1) << 2)) != 0; + const bool input1_broadcast_dim4 = (bm1 & (uint32_t(1) << 3)) != 0; + const bool input1_broadcast_dim5 = (bm1 & (uint32_t(1) << 4)) != 0; + const bool input1_broadcast_dim6 = (bm1 & (uint32_t(1) << 5)) != 0; + const bool input2_broadcast_dim1 = (bm2 & (uint32_t(1) << 0)) != 0; + const bool input2_broadcast_dim2 = (bm2 & (uint32_t(1) << 1)) != 0; + const bool input2_broadcast_dim3 = (bm2 & (uint32_t(1) << 2)) != 0; + const bool input2_broadcast_dim4 = (bm2 & (uint32_t(1) << 3)) != 0; + const bool input2_broadcast_dim5 = (bm2 & (uint32_t(1) << 4)) != 0; + const bool input2_broadcast_dim6 = (bm2 & (uint32_t(1) << 5)) != 0; + const size_t input1_dim1 = input1_broadcast_dim1 ? 1 : kDim1; + const size_t input1_dim2 = input1_broadcast_dim2 ? 1 : kDim2; + const size_t input1_dim3 = input1_broadcast_dim3 ? 1 : kDim3; + const size_t input1_dim4 = input1_broadcast_dim4 ? 
1 : kDim4; + const size_t input1_dim5 = input1_broadcast_dim5 ? 1 : kDim5; + const size_t input1_dim6 = input1_broadcast_dim6 ? 1 : kDim6; + const size_t input2_dim1 = input2_broadcast_dim1 ? 1 : kDim1; + const size_t input2_dim2 = input2_broadcast_dim2 ? 1 : kDim2; + const size_t input2_dim3 = input2_broadcast_dim3 ? 1 : kDim3; + const size_t input2_dim4 = input2_broadcast_dim4 ? 1 : kDim4; + const size_t input2_dim5 = input2_broadcast_dim5 ? 1 : kDim5; + const size_t input2_dim6 = input2_broadcast_dim6 ? 1 : kDim6; + BinaryElementwiseOperatorTester() + .operation_type(BinaryElementwiseOperatorTester::OperationType::OR) + .input1_shape({input1_dim6, input1_dim5, input1_dim4, input1_dim3, input1_dim2, input1_dim1}) + .input2_shape({input2_dim6, input2_dim5, input2_dim4, input2_dim3, input2_dim2, input2_dim1}) + .iterations(1) + .TestS32(); + } + } +} diff --git a/test/or.cc b/test/or.cc new file mode 100644 index 00000000000..94057777969 --- /dev/null +++ b/test/or.cc @@ -0,0 +1,138 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include "xnnpack.h" +#include "xnnpack/node-type.h" +#include "xnnpack/operator.h" +#include "xnnpack/subgraph.h" +#include "subgraph-binary-tester.h" + +using OrS32 = BinaryTest<int32_t>; + +TEST_F(OrS32, define) { + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); + std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph); + + std::vector<size_t> dims = RandomShape(); + + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, dims.size(), dims.data(), nullptr, + /*external_id=*/0, /*flags=*/0, &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t input2_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, dims.size(), dims.data(), nullptr, + /*external_id=*/0, /*flags=*/0, &input2_id)); + ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, + xnn_define_tensor_value( + subgraph, xnn_datatype_int32, dims.size(), dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ( + xnn_status_success, + xnn_define_or(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); + + ASSERT_EQ(subgraph->num_nodes, 1); + const struct xnn_node* node = &subgraph->nodes[0]; + ASSERT_EQ(node->type, xnn_node_type_or); + ASSERT_EQ(node->compute_type, xnn_compute_type_s32); + ASSERT_EQ(node->num_inputs, 2); + ASSERT_EQ(node->inputs[0], input1_id); + ASSERT_EQ(node->inputs[1], input2_id); + ASSERT_EQ(node->num_outputs, 1); + ASSERT_EQ(node->outputs[0], output_id); + ASSERT_EQ(node->flags, 0); +} + +TEST_F(OrS32, matches_operator_api) +{ + std::generate(input1.begin(), input1.end(), [&]() { return s32dist(rng); }); + std::generate(input2.begin(), input2.end(), [&]() { return s32dist(rng); }); + std::fill(operator_output.begin(), operator_output.end(), INT_MAX); + std::fill(subgraph_output.begin(), subgraph_output.end(), INT_MAX); + + ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + + xnn_operator_t op = nullptr; + + // Call 
operator API. + ASSERT_EQ(xnn_status_success, xnn_create_or_nd_s32(/*flags=*/0, &op)); + std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator); + + ASSERT_EQ( + xnn_status_success, xnn_reshape_or_nd_s32( + op, input1_dims.size(), input1_dims.data(), input2_dims.size(), input2_dims.data(), + /*threadpool=*/nullptr)); + + ASSERT_EQ( + xnn_status_success, xnn_setup_or_nd_s32(op, input1.data(), input2.data(), operator_output.data())); + + ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); + + // Call subgraph API. + xnn_subgraph_t subgraph = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_subgraph(3, /*flags=*/0, &subgraph)); + std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph); + + uint32_t input1_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, input1_dims.size(), input1_dims.data(), nullptr, + /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input1_id)); + ASSERT_NE(input1_id, XNN_INVALID_NODE_ID); + + uint32_t input2_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, xnn_define_tensor_value( + subgraph, xnn_datatype_int32, input2_dims.size(), input2_dims.data(), nullptr, + /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input2_id)); + ASSERT_NE(input2_id, XNN_INVALID_NODE_ID); + + uint32_t output_id = XNN_INVALID_NODE_ID; + ASSERT_EQ( + xnn_status_success, + xnn_define_tensor_value( + subgraph, xnn_datatype_int32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, + /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); + ASSERT_NE(output_id, XNN_INVALID_NODE_ID); + + ASSERT_EQ( + xnn_status_success, + xnn_define_or(subgraph, input1_id, input2_id, output_id, /*flags=*/0)); + + xnn_runtime_t runtime = nullptr; + ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); + ASSERT_NE(nullptr, runtime); + std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime); + std::array<xnn_external_value, 3> external = { + xnn_external_value{input1_id, input1.data()}, xnn_external_value{input2_id, input2.data()}, + xnn_external_value{output_id, subgraph_output.data()}}; + ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); + ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); + + ASSERT_EQ(subgraph_output, operator_output); +} diff --git a/test/s32-vor.cc b/test/s32-vor.cc new file mode 100644 index 00000000000..33533ba9619 --- /dev/null +++ b/test/s32-vor.cc @@ -0,0 +1,1568 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
+// Specification: test/s32-vor.yaml +// Generator: tools/generate-vbinary-test.py + + +#include +#include "xnnpack/common.h" +#include "xnnpack/isa-checks.h" +#include "xnnpack/microparams-init.h" +#include "xnnpack/vbinary.h" +#include "vbinary-microkernel-tester.h" + + +TEST(S32_VOR__SCALAR_U1, batch_eq_1) { + VBinaryMicrokernelTester() + .batch_size(1) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U1, batch_gt_1) { + for (size_t batch_size = 2; batch_size < 10; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U1, inplace_a) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U1, inplace_b) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U1, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u1, VBinaryMicrokernelTester::OpType::OR); + } +} + + +TEST(S32_VOR__SCALAR_U2, batch_eq_2) { + VBinaryMicrokernelTester() + .batch_size(2) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U2, batch_div_2) { + for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, batch_lt_2) { + for (size_t batch_size = 1; batch_size < 2; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, batch_gt_2) { + for (size_t batch_size = 3; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, inplace_a) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, inplace_b) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U2, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u2, VBinaryMicrokernelTester::OpType::OR); + } +} + + +TEST(S32_VOR__SCALAR_U4, batch_eq_4) { + VBinaryMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + 
VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, inplace_a) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, inplace_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U4, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u4, VBinaryMicrokernelTester::OpType::OR); + } +} + + +TEST(S32_VOR__SCALAR_U8, batch_eq_8) { + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); +} + +TEST(S32_VOR__SCALAR_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, inplace_a) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, inplace_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + +TEST(S32_VOR__SCALAR_U8, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__scalar_u8, VBinaryMicrokernelTester::OpType::OR); + } +} + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U4, batch_eq_4) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U4, batch_div_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 8; batch_size < 40; 
batch_size += 4) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, batch_lt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, batch_gt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U4, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u4, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U8, batch_eq_8) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U8, batch_div_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, batch_lt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, batch_gt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U8, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + 
.Test(xnn_s32_vor_ukernel__sse41_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U12, batch_eq_12) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U12, batch_div_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, batch_lt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, batch_gt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U12, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u12, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__SSE41_U16, batch_eq_16) { + TEST_REQUIRES_X86_SSE41; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__SSE41_U16, batch_div_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, batch_lt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, batch_gt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, inplace_a) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + 
TEST(S32_VOR__SSE41_U16, inplace_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__SSE41_U16, inplace_a_and_b) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__sse41_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U8, batch_eq_8) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U8, batch_div_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, batch_lt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, batch_gt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U8, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 17; batch_size < 
32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U16, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U24, batch_eq_24) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(24) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U24, batch_div_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, batch_lt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, batch_gt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 25; batch_size < 48; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U24, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u24, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX2_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX2; + VBinaryMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX2_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryMicrokernelTester() + 
.batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, inplace_a) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, inplace_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX2_U32, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx2_u32, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U16, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + 
.inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U32, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u32, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U48, batch_eq_48) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(48) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U48, batch_div_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 96; batch_size < 480; batch_size += 48) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, batch_lt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 48; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, batch_gt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 49; batch_size < 96; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + 
.Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U48, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u48, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VOR__AVX512F_U64, batch_eq_64) { + TEST_REQUIRES_X86_AVX512F; + VBinaryMicrokernelTester() + .batch_size(64) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__AVX512F_U64, batch_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 128; batch_size < 640; batch_size += 64) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, batch_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 64; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, batch_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 65; batch_size < 128; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, inplace_a) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, inplace_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__AVX512F_U64, inplace_a_and_b) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__avx512f_u64, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U4, batch_eq_4) { + VBinaryMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, 
VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, inplace_a) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, inplace_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U4, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u4, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U8, batch_eq_8) { + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, inplace_a) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, inplace_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U8, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U12, batch_eq_12) { + VBinaryMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U12, batch_div_12) { + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, 
VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, batch_lt_12) { + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, batch_gt_12) { + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, inplace_a) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, inplace_b) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U12, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u12, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VOR__WASMSIMD_U16, batch_eq_16) { + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__WASMSIMD_U16, batch_div_16) { + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, batch_lt_16) { + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, batch_gt_16) { + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, inplace_a) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, inplace_b) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__WASMSIMD_U16, inplace_a_and_b) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__wasmsimd_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U4, batch_eq_4) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(4) + 
.Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U4, batch_div_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, batch_lt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, batch_gt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U4, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u4, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U8, batch_eq_8) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U8, batch_div_8) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, batch_lt_8) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, batch_gt_8) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U8, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t 
batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u8, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U12, batch_eq_12) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U12, batch_div_12) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, batch_lt_12) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, batch_gt_12) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U12, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u12, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VOR__NEON_U16, batch_eq_16) { + TEST_REQUIRES_ARM_NEON; + VBinaryMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + + TEST(S32_VOR__NEON_U16, batch_div_16) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, batch_lt_16) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, batch_gt_16) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, inplace_a) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + 
.inplace_a(true) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, inplace_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } + + TEST(S32_VOR__NEON_U16, inplace_a_and_b) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryMicrokernelTester() + .batch_size(batch_size) + .inplace_a(true) + .inplace_b(true) + .Test(xnn_s32_vor_ukernel__neon_u16, VBinaryMicrokernelTester::OpType::OR); + } + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/test/s32-vor.yaml b/test/s32-vor.yaml new file mode 100644 index 00000000000..a31e3d719a5 --- /dev/null +++ b/test/s32-vor.yaml @@ -0,0 +1,40 @@ +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Scalar +- name: xnn_s32_vor_ukernel__scalar_u1 +- name: xnn_s32_vor_ukernel__scalar_u2 +- name: xnn_s32_vor_ukernel__scalar_u4 +- name: xnn_s32_vor_ukernel__scalar_u8 + +# x86 SSE41 +- name: xnn_s32_vor_ukernel__sse41_u4 +- name: xnn_s32_vor_ukernel__sse41_u8 +- name: xnn_s32_vor_ukernel__sse41_u12 +- name: xnn_s32_vor_ukernel__sse41_u16 + +# x86 AVX2 +- name: xnn_s32_vor_ukernel__avx2_u8 +- name: xnn_s32_vor_ukernel__avx2_u16 +- name: xnn_s32_vor_ukernel__avx2_u24 +- name: xnn_s32_vor_ukernel__avx2_u32 + +# x86 AVX512F +- name: xnn_s32_vor_ukernel__avx512f_u16 +- name: xnn_s32_vor_ukernel__avx512f_u32 +- name: xnn_s32_vor_ukernel__avx512f_u48 +- name: xnn_s32_vor_ukernel__avx512f_u64 + +# Wasm SIMD +- name: xnn_s32_vor_ukernel__wasmsimd_u4 +- name: xnn_s32_vor_ukernel__wasmsimd_u8 +- name: xnn_s32_vor_ukernel__wasmsimd_u12 +- name: xnn_s32_vor_ukernel__wasmsimd_u16 + +# ARM NEON +- name: xnn_s32_vor_ukernel__neon_u4 +- name: xnn_s32_vor_ukernel__neon_u8 +- name: xnn_s32_vor_ukernel__neon_u12 +- name: xnn_s32_vor_ukernel__neon_u16 diff --git a/test/s32-vorc.cc b/test/s32-vorc.cc new file mode 100644 index 00000000000..87e01d4ded5 --- /dev/null +++ b/test/s32-vorc.cc @@ -0,0 +1,1080 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Auto-generated file. Do not edit! 
+// Specification: test/s32-vorc.yaml +// Generator: tools/generate-vbinary-test.py + + +#include <gtest/gtest.h> +#include "xnnpack/common.h" +#include "xnnpack/isa-checks.h" +#include "xnnpack/microparams-init.h" +#include "xnnpack/vbinary.h" +#include "vbinaryc-microkernel-tester.h" + + +TEST(S32_VORC__SCALAR_U1, batch_eq_1) { + VBinaryCMicrokernelTester() + .batch_size(1) + .Test(xnn_s32_vorc_ukernel__scalar_u1, VBinaryCMicrokernelTester::OpType::ORC); +} + +TEST(S32_VORC__SCALAR_U1, batch_gt_1) { + for (size_t batch_size = 2; batch_size < 10; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u1, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U1, inplace) { + for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u1, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +TEST(S32_VORC__SCALAR_U2, batch_eq_2) { + VBinaryCMicrokernelTester() + .batch_size(2) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); +} + +TEST(S32_VORC__SCALAR_U2, batch_div_2) { + for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U2, batch_lt_2) { + for (size_t batch_size = 1; batch_size < 2; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U2, batch_gt_2) { + for (size_t batch_size = 3; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U2, inplace) { + for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u2, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +TEST(S32_VORC__SCALAR_U4, batch_eq_4) { + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); +} + +TEST(S32_VORC__SCALAR_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U4, inplace) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u4, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +TEST(S32_VORC__SCALAR_U8, batch_eq_8) { + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); +} +
+TEST(S32_VORC__SCALAR_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + +TEST(S32_VORC__SCALAR_U8, inplace) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__scalar_u8, VBinaryCMicrokernelTester::OpType::ORC); + } +} + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U4, batch_eq_4) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U4, batch_div_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U4, batch_lt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U4, batch_gt_4) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U4, inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U8, batch_eq_8) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U8, batch_div_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U8, batch_lt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U8, batch_gt_8) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U8, 
inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U12, batch_eq_12) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U12, batch_div_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 24; batch_size < 120; batch_size += 12) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U12, batch_lt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U12, batch_gt_12) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U12, inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__SSE41_U16, batch_eq_16) { + TEST_REQUIRES_X86_SSE41; + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__SSE41_U16, batch_div_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U16, batch_lt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U16, batch_gt_16) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__SSE41_U16, inplace) { + TEST_REQUIRES_X86_SSE41; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__sse41_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U8, batch_eq_8) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U8, batch_div_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 16; batch_size < 80; batch_size += 
8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U8, batch_lt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U8, batch_gt_8) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U8, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U16, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U24, batch_eq_24) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(24) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U24, batch_div_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U24, batch_lt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 24; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U24, batch_gt_24) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 25; batch_size < 48; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + 
TEST(S32_VORC__AVX2_U24, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u24, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX2_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX2; + VBinaryCMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX2_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX2_U32, inplace) { + TEST_REQUIRES_X86_AVX2; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx2_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U16, batch_eq_16) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U16, batch_div_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U16, batch_lt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U16, batch_gt_16) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U16, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(32) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U32, batch_div_32) { + 
TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U32, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u32, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U48, batch_eq_48) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(48) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U48, batch_div_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 96; batch_size < 480; batch_size += 48) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U48, batch_lt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 48; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U48, batch_gt_48) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 49; batch_size < 96; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U48, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u48, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + TEST(S32_VORC__AVX512F_U64, batch_eq_64) { + TEST_REQUIRES_X86_AVX512F; + VBinaryCMicrokernelTester() + .batch_size(64) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__AVX512F_U64, batch_div_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 128; batch_size < 640; batch_size += 64) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U64, batch_lt_64) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size < 64; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U64, batch_gt_64) { + TEST_REQUIRES_X86_AVX512F; + for 
(size_t batch_size = 65; batch_size < 128; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__AVX512F_U64, inplace) { + TEST_REQUIRES_X86_AVX512F; + for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__avx512f_u64, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U4, batch_eq_4) { + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U4, batch_div_4) { + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U4, batch_lt_4) { + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U4, batch_gt_4) { + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U4, inplace) { + for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U8, batch_eq_8) { + VBinaryCMicrokernelTester() + .batch_size(8) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U8, batch_div_8) { + for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U8, batch_lt_8) { + for (size_t batch_size = 1; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U8, batch_gt_8) { + for (size_t batch_size = 9; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U8, inplace) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u8, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U12, batch_eq_12) { + VBinaryCMicrokernelTester() + .batch_size(12) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U12, batch_div_12) { + for (size_t batch_size = 24; 
batch_size < 120; batch_size += 12) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U12, batch_lt_12) { + for (size_t batch_size = 1; batch_size < 12; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U12, batch_gt_12) { + for (size_t batch_size = 13; batch_size < 24; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U12, inplace) { + for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u12, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + TEST(S32_VORC__WASMSIMD_U16, batch_eq_16) { + VBinaryCMicrokernelTester() + .batch_size(16) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__WASMSIMD_U16, batch_div_16) { + for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U16, batch_lt_16) { + for (size_t batch_size = 1; batch_size < 16; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U16, batch_gt_16) { + for (size_t batch_size = 17; batch_size < 32; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__WASMSIMD_U16, inplace) { + for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(xnn_s32_vorc_ukernel__wasmsimd_u16, VBinaryCMicrokernelTester::OpType::ORC); + } + } +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(S32_VORC__NEON_U4, batch_eq_4) { + TEST_REQUIRES_ARM_NEON; + VBinaryCMicrokernelTester() + .batch_size(4) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + + TEST(S32_VORC__NEON_U4, batch_div_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__NEON_U4, batch_lt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 1; batch_size < 4; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__NEON_U4, batch_gt_4) { + TEST_REQUIRES_ARM_NEON; + for (size_t batch_size = 5; batch_size < 8; batch_size++) { + VBinaryCMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC); + } + } + + TEST(S32_VORC__NEON_U4, inplace) { + TEST_REQUIRES_ARM_NEON; + for (size_t 
batch_size = 1; batch_size <= 20; batch_size += 3) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u4, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+TEST(S32_VORC__NEON_U8, batch_eq_8) {
+  TEST_REQUIRES_ARM_NEON;
+  VBinaryCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+}
+
+TEST(S32_VORC__NEON_U8, batch_div_8) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U8, batch_lt_8) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U8, batch_gt_8) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U8, inplace) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u8, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+TEST(S32_VORC__NEON_U12, batch_eq_12) {
+  TEST_REQUIRES_ARM_NEON;
+  VBinaryCMicrokernelTester()
+    .batch_size(12)
+    .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+}
+
+TEST(S32_VORC__NEON_U12, batch_div_12) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 24; batch_size < 120; batch_size += 12) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U12, batch_lt_12) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size < 12; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U12, batch_gt_12) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 13; batch_size < 24; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U12, inplace) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size <= 60; batch_size += 11) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u12, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+TEST(S32_VORC__NEON_U16, batch_eq_16) {
+  TEST_REQUIRES_ARM_NEON;
+  VBinaryCMicrokernelTester()
+    .batch_size(16)
+    .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+}
+
+TEST(S32_VORC__NEON_U16, batch_div_16) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U16, batch_lt_16) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U16, batch_gt_16) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+
+TEST(S32_VORC__NEON_U16, inplace) {
+  TEST_REQUIRES_ARM_NEON;
+  for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+    VBinaryCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_s32_vorc_ukernel__neon_u16, VBinaryCMicrokernelTester::OpType::ORC);
+  }
+}
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/s32-vorc.yaml b/test/s32-vorc.yaml
new file mode 100644
index 00000000000..79c15d7f047
--- /dev/null
+++ b/test/s32-vorc.yaml
@@ -0,0 +1,40 @@
+# Copyright 2024 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Scalar
+- name: xnn_s32_vorc_ukernel__scalar_u1
+- name: xnn_s32_vorc_ukernel__scalar_u2
+- name: xnn_s32_vorc_ukernel__scalar_u4
+- name: xnn_s32_vorc_ukernel__scalar_u8
+
+# x86 SSE41
+- name: xnn_s32_vorc_ukernel__sse41_u4
+- name: xnn_s32_vorc_ukernel__sse41_u8
+- name: xnn_s32_vorc_ukernel__sse41_u12
+- name: xnn_s32_vorc_ukernel__sse41_u16
+
+# x86 AVX2
+- name: xnn_s32_vorc_ukernel__avx2_u8
+- name: xnn_s32_vorc_ukernel__avx2_u16
+- name: xnn_s32_vorc_ukernel__avx2_u24
+- name: xnn_s32_vorc_ukernel__avx2_u32
+
+# x86 AVX512F
+- name: xnn_s32_vorc_ukernel__avx512f_u16
+- name: xnn_s32_vorc_ukernel__avx512f_u32
+- name: xnn_s32_vorc_ukernel__avx512f_u48
+- name: xnn_s32_vorc_ukernel__avx512f_u64
+
+# Wasm SIMD
+- name: xnn_s32_vorc_ukernel__wasmsimd_u4
+- name: xnn_s32_vorc_ukernel__wasmsimd_u8
+- name: xnn_s32_vorc_ukernel__wasmsimd_u12
+- name: xnn_s32_vorc_ukernel__wasmsimd_u16
+
+# ARM NEON
+- name: xnn_s32_vorc_ukernel__neon_u4
+- name: xnn_s32_vorc_ukernel__neon_u8
+- name: xnn_s32_vorc_ukernel__neon_u12
+- name: xnn_s32_vorc_ukernel__neon_u16
diff --git a/test/vbinary-microkernel-tester.cc b/test/vbinary-microkernel-tester.cc
index ea557db1bfb..a2f94227d00 100644
--- a/test/vbinary-microkernel-tester.cc
+++ b/test/vbinary-microkernel-tester.cc
@@ -324,6 +324,9 @@ void VBinaryMicrokernelTester::Test(
         // Overflow is the expected behaviour
         y_ref[i] = ((((int64_t) a_data[i] * (int64_t) b_data[i]) << 32) >> 32);
         break;
+      case OpType::OR:
+        y_ref[i] = a_data[i] | b_data[i];
+        break;
       case OpType::SqrDiff: {
         const int32_t diff = a_data[i] - b_data[i];
         y_ref[i] = diff * diff;
diff --git a/test/vbinary-microkernel-tester.h b/test/vbinary-microkernel-tester.h
index a0561eaadd5..446a0eeb116 100644
--- a/test/vbinary-microkernel-tester.h
+++ b/test/vbinary-microkernel-tester.h
@@ -26,6 +26,7 @@ class VBinaryMicrokernelTester {
     Max,
     Min,
     Mul,
+    OR,
     Sub,
     SqrDiff,
   };
diff --git a/test/vbinaryc-microkernel-tester.cc b/test/vbinaryc-microkernel-tester.cc
index d534f7d0672..49def1cf1c5 100644
--- a/test/vbinaryc-microkernel-tester.cc
+++ b/test/vbinaryc-microkernel-tester.cc
@@ -351,6 +351,9 @@ void VBinaryCMicrokernelTester::Test(
         // Overflow is the expected behaviour
         y_ref[i] = ((((int64_t) a_data[i] * (int64_t) b) << 32) >> 32);
         break;
+      case OpType::ORC:
+        y_ref[i] = a_data[i] | b;
+        break;
       case OpType::SqrDiffC: {
         const int32_t diff = a_data[i] - b;
         y_ref[i] = diff * diff;
diff --git a/test/vbinaryc-microkernel-tester.h b/test/vbinaryc-microkernel-tester.h
index 2bc1f5327c2..3b11207289e 100644
--- a/test/vbinaryc-microkernel-tester.h
+++ b/test/vbinaryc-microkernel-tester.h
@@ -27,6 +27,7 @@ class VBinaryCMicrokernelTester {
     MaxC,
     MinC,
     MulC,
+    ORC,
     SqrDiffC,
     SubC,
     RSubC,
diff --git a/tools/generate-vbinary-test.py b/tools/generate-vbinary-test.py
index 3a3796d95f7..678b0212464 100755
--- a/tools/generate-vbinary-test.py
+++ b/tools/generate-vbinary-test.py
@@ -32,7 +32,7 @@ def split_ukernel_name(name):
-  match = re.fullmatch(r"xnn_(qu8|qs8|f16|f32|s32)_v(add|cmul|copysign|div|max|min|mul|sqrdiff|sub|addc|copysignc|rcopysignc|divc|rdivc|maxc|minc|mulc|sqrdiffc|subc|rsubc)(_(minmax|relu)(_(fp32|rndnu))?)?_ukernel__(.+)_u(\d+)(v)?", name)
+  match = re.fullmatch(r"xnn_(qu8|qs8|f16|f32|s32)_v(add|cmul|copysign|div|max|min|mul|sqrdiff|sub|addc|copysignc|rcopysignc|divc|rdivc|maxc|minc|mulc|sqrdiffc|subc|rsubc|or|orc)(_(minmax|relu)(_(fp32|rndnu))?)?_ukernel__(.+)_u(\d+)(v)?", name)
   if match is None:
     raise ValueError("Unexpected microkernel name: " + name)
   op_type = {
@@ -53,6 +53,8 @@ def split_ukernel_name(name):
     "maxc": "MaxC",
     "minc": "MinC",
     "mulc": "MulC",
+    "or": "OR",
+    "orc": "ORC",
     "sqrdiffc": "SqrDiffC",
     "subc": "SubC",
     "rsubc": "RSubC",
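
Note: the new OR/ORC op types are plain bitwise ORs on int32 data, matching the reference expectations added to the testers above (y_ref[i] = a_data[i] | b_data[i] for OR, y_ref[i] = a_data[i] | b for ORC). The standalone sketch below only illustrates that semantics; the helper name and signature are hypothetical, and it is not one of the generated xnn_s32_vor*/xnn_s32_vorc* microkernels listed in the build files.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative reference for the ORC ("OR with a broadcast constant") op:
// every element of a is OR-ed with the same scalar b, which is exactly the
// expectation the VBinaryCMicrokernelTester checks against.
static void s32_orc_reference(size_t batch, const int32_t* a, int32_t b, int32_t* y) {
  assert(batch != 0);
  for (size_t i = 0; i < batch; i++) {
    y[i] = a[i] | b;
  }
}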