diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 6c71909c47..3cd880622c 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -23,6 +23,7 @@ include(${EXECUTORCH_ROOT}/build/Utils.cmake) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(TARGET_DIR reference) if(EXECUTORCH_CADENCE_CPU_RUNNER) include(${EXECUTORCH_ROOT}/build/Codegen.cmake) @@ -60,9 +61,6 @@ if(EXECUTORCH_CADENCE_CPU_RUNNER) ${_common_include_directories} ) - set(TARGET_DIR reference) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) - target_link_libraries( cadence_runner executorch diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index f1a5b6a50b..e7c16d0031 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -142,41 +142,6 @@ - arg_meta: null kernel_name: torch::executor::where_out -- op: transpose_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::transpose_copy_int_out - -- op: eq.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::eq_scalar_out - -- op: logical_not.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_not_out - -- op: any.out - kernels: - - arg_meta: null - kernel_name: torch::executor::any_out - -- op: native_group_norm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::native_group_norm_out - -- op: sum.IntList_out - kernels: - - arg_meta: null - kernel_name: torch::executor::sum_dim_out - -- op: select_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::select_copy_int_out - # custom ops - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function @@ -218,18 +183,3 @@ kernels: - arg_meta: null kernel_name: impl::reference::quantized_matmul_out - -- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_linear_per_tensor_out - -- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::im2row_out - -- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) 
- kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_conv_per_tensor_out diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index a2d51af2c0..c40d3ff66b 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -55,16 +55,6 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_transpose_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_eq.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_logical_not.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_any.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_native_group_norm.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp" ) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) @@ -88,7 +78,6 @@ add_library( "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" "quantized_matmul_out.cpp" - "im2row_out.cpp" ) target_include_directories( custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} diff --git a/backends/cadence/reference/operators/im2row_out.cpp b/backends/cadence/reference/operators/im2row_out.cpp deleted file mode 100644 index dd539b6f9b..0000000000 --- a/backends/cadence/reference/operators/im2row_out.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#include - -#include - -namespace impl { -namespace reference { -namespace native { - -using ::executorch::aten::IntArrayRef; -using ::executorch::aten::ScalarType; -using ::executorch::aten::Tensor; -using ::executorch::runtime::KernelRuntimeContext; - -template -__attribute__((always_inline)) void im2row_( - const T* __restrict__ data_im, - const int32_t in_zero_point, - /* input parameters*/ - const int32_t channels, - const int32_t height, - const int32_t width, - /* output parameters */ - const int32_t out_height, - const int32_t out_width, - /* convolution parameters */ - const int32_t kernel_h, - const int32_t kernel_w, - const int32_t pad_h, - const int32_t pad_w, - const int32_t stride_h, - const int32_t stride_w, - const int32_t dilation_h, - const int32_t dilation_w, - T* __restrict__ data_col, - bool channels_last) { - // Consider convolving the input image of dimensions channels * height * width - // (or height * width * channels for NHWC layout) with a filter of dimensions - // channels * kernels_h * kernels_w. Assume that this convolution will produce - // an output of dimensinos out_height x out_width. For each point the output, - // im2row takes the data from the input that is used in the computation of - // that output point, and flattens it into a vector of size channels_col = - // channels * kernel_h * kernel_w. The output of im2row will therefore be a 2D - // array of size (out_height * out_width) x channels_col - const int32_t channels_col = channels * kernel_h * kernel_w; - - // If the layout is NHWC, we can copy 'channels' worth of contiguous data - // points when performing im2row. 
- if (channels_last) { - // Iterate over the output domain - for (int _h = 0; _h < out_height; ++_h) { - for (int _w = 0; _w < out_width; ++_w) { - int32_t i_col = _h * out_width + _w; - // Each point in the output domain is the result of applying a filter of - // size kernel_h x kernel_w x channels on the input. But since channels - // is contiguous, we will not explicitly have a loop for it. - for (int _kh = 0; _kh < kernel_h; ++_kh) { - int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; - for (int _kw = 0; _kw < kernel_w; ++_kw) { - int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; - - // h_im and w_im are the actual height and width coordinates of the - // input tensor from where we need to copy 'channels' points. - const T* __restrict__ slice_im = - data_im + (h_im * width + w_im) * channels; - T* __restrict__ slice_col = data_col + i_col * channels_col + - (_kh * kernel_w + _kw) * channels; - // If the coordinates were within the input domain, we copy - // 'channels' contiguous values. Otherwise we will fill the output - // with 0's. - if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { - std::memcpy(slice_col, slice_im, channels * sizeof(T)); - } else { - std::fill_n(slice_col, channels, T(in_zero_point)); - } - } - } - } - } - } else { - // Iterate over the output domain - for (int _h = 0; _h < out_height; ++_h) { - for (int _w = 0; _w < out_width; ++_w) { - int32_t i_col = _h * out_width + _w; - - // Each point in the output domain is the result of applying a filter - // of size chanenls * kernel_h x kernel_w on the input - for (int _c = 0; _c < channels; ++_c) { - for (int _kh = 0; _kh < kernel_h; ++_kh) { - for (int _kw = 0; _kw < kernel_w; ++_kw) { - // c_col is the linearized access in the channels_col vector. - int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw; - // h_im and w_im are the actual height and width coordinates of - // the input tensor that we need to copy to the output. - int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; - int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; - // If the current data access is within the input tensor, copy the - // value - data_col[i_col * channels_col + c_col] = - (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) - ? data_im[(_c * height + h_im) * width + w_im] - : static_cast(in_zero_point); - } - } - } - } - } - } -} - -void im2row_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef dilation, - IntArrayRef padding, - IntArrayRef stride, - const Tensor& in_zero_point, - bool channel_last, - Tensor& out) { - // Compute the input tensor's dims - bool unit_height = input.dim() == 3; - const int32_t batch_size = input.size(0); - const int32_t in_c = - channel_last ? input.size(3 - unit_height) : input.size(1); - const int32_t in_h = - unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); - const int32_t in_w = - channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height); - - // Get the kernel parameters - int32_t kernel_h = kernel_size[0]; - int32_t kernel_w = kernel_size[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; - int32_t pad_h = padding[0]; - int32_t pad_w = padding[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - - // If we were to apply a convolution on the input tensor, compute the output - // height and width. 
- int32_t out_h = - (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; - int32_t out_w = - (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; - - ET_DCHECK_MSG( - (out_h * out_w) == out.size(1), "dimension mismatch for output"); - ET_DCHECK_MSG( - (kernel_h * kernel_w * in_c) == out.size(2), - "dimension mismatch for output"); - - // Check if the input is per-tensor quantized or per-channel quantized. The - // zero point for each batch could differ for per-channel quantized input. - bool per_tensor_quantized = in_zero_point.numel() == 1; - -#define typed_im2row(dtype, ctype) \ - case ScalarType::dtype: { \ - const ctype* __restrict__ in_data = input.const_data_ptr(); \ - ctype* __restrict__ out_data = out.mutable_data_ptr(); \ - const int32_t* __restrict__ zero_point = \ - in_zero_point.const_data_ptr(); \ - int32_t in_plane = in_c * in_h * in_w; \ - int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ - for (size_t n = 0; n < batch_size; ++n) { \ - im2row_( \ - &in_data[n * in_plane], \ - per_tensor_quantized ? zero_point[0] : zero_point[n], \ - in_c, \ - in_h, \ - in_w, \ - out_h, \ - out_w, \ - kernel_h, \ - kernel_w, \ - pad_h, \ - pad_w, \ - stride_h, \ - stride_w, \ - dilation_h, \ - dilation_w, \ - &out_data[n * out_plane], \ - channel_last); \ - } \ - break; \ - } - - ScalarType dtype = input.scalar_type(); - switch (dtype) { - typed_im2row(Float, float); - typed_im2row(Byte, uint8_t); - typed_im2row(Char, int8_t); - default: - ET_DCHECK_MSG( - false, - "im2row not implemented for dtype %s", - torch::executor::toString(dtype)); - } -#undef typed_im2row -} - -} // namespace native -} // namespace reference -} // namespace impl diff --git a/backends/cadence/reference/operators/operators.h b/backends/cadence/reference/operators/operators.h deleted file mode 100644 index 0ff4639255..0000000000 --- a/backends/cadence/reference/operators/operators.h +++ /dev/null @@ -1,57 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include -#include -#include -#include - -namespace cadence { -namespace impl { -namespace cpu { -namespace native { -namespace { -using ::executorch::runtime::getLeadingDims; - -#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \ - _(uint8_t, Byte) \ - _(int8_t, Char) - -inline __attribute__((always_inline)) void linear_( - const ::executorch::aten::Tensor& input, - const ::executorch::aten::Tensor& weight, - const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, - ::executorch::aten::Tensor& output) { - const float* __restrict__ input_data = input.const_data_ptr(); - const float* __restrict__ weight_data = weight.const_data_ptr(); - const float* __restrict__ bias_data = bias.value().const_data_ptr(); - float* __restrict__ output_data = output.mutable_data_ptr(); - - // input comes in shape [batch_size, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [batch_size, out_dim] - // Perform matrix multiply (M x N) x (N x P) => M x P - int64_t M = weight.size(0); // = out_dim - int64_t N = weight.size(1); // = in_dim - - // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the - // leading dimensions is d0 * d1 * ... 
* d_{N-2} - int64_t leading_dims = getLeadingDims(input, input.dim() - 1); - - for (int i = 0; i < leading_dims; ++i) { - for (int j = 0; j < M; ++j) { - float sum = bias_data[j]; - for (int k = 0; k < N; ++k) { - sum += input_data[i * N + k] * weight_data[j * N + k]; - } - output_data[i * M + j] = sum; - } - } -} - -} // namespace -} // namespace native -} // namespace cpu -} // namespace impl -} // namespace cadence diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 5a7af85809..de19f3ef43 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -1,16 +1,21 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ #include -#include + +#include namespace impl { namespace reference { namespace native { -using ::executorch::aten::IntArrayRef; -using ::executorch::aten::ScalarType; -using ::executorch::aten::Tensor; -using ::executorch::runtime::KernelRuntimeContext; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; // This implements a generic 2d conv kernel that operates on raw pointers. // The version handles both quantized and fp32 convolutions. @@ -18,12 +23,7 @@ using ::executorch::runtime::KernelRuntimeContext; // The weight is of shape [oc x wc x wh x ww], where wc == c // The output is of shape [n x oc x oh x ow] // The bias is of shape [oc] -template < - typename IT = float, - typename WT = IT, - typename BT = IT, - typename OT = IT, - bool quantized = false> +template __attribute__((noinline)) void conv2d_nchw_core_generic( // All the arrays const IT* __restrict__ p_in, @@ -56,10 +56,11 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( // input zero point IT in_zero_point = 0, // weight zero point - int32_t weight_zero_point = 0, - float bias_scale = 1, + const int32_t* __restrict__ weight_zero_point = nullptr, + const float* __restrict__ bias_scale = nullptr, float out_scale = 1, - OT out_zero_point = 0) { + OT out_zero_point = 0, + bool per_tensor_quantized = true) { float inv_out_scale = 1. / out_scale; bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; @@ -105,7 +106,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( int woff = _wh * ww + _ww; float lhs = in_plane[ioff] - in_zero_point; float rhs = weight_plane[woff] - - (quantized ? weight_zero_point : 0); + (quantized ? weight_zero_point[0] : 0); acc += lhs * rhs; } } @@ -125,7 +126,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( int woff = _wh * ww + _ww; float lhs = in_plane[ioff] - in_zero_point; float rhs = weight_plane[woff] - - (quantized ? weight_zero_point : 0); + (quantized ? weight_zero_point[0] : 0); acc += lhs * rhs; } } @@ -133,10 +134,11 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( } } if (quantized) { - float val = bias_scale * acc; + float val = + (per_tensor_quantized ? 
bias_scale[0] : bias_scale[_oc]) * + acc; out_plane[_oh * ow + _ow] = - ::impl::reference::kernels::quantize( - val, inv_out_scale, out_zero_point); + kernels::quantize(val, inv_out_scale, out_zero_point); } else { out_plane[_oh * ow + _ow] = acc; } @@ -147,149 +149,27 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( } } -template < - typename IT = float, - typename WT = IT, - typename BT = IT, - typename OT = IT, - bool quantized = false> -__attribute__((noinline)) void conv2d_nhwc_core_generic( - // All the arrays - const IT* __restrict__ p_in, - const WT* __restrict__ p_weight, - const BT* __restrict__ p_bias, - OT* __restrict__ p_out, - // The array sizes - int32_t n, - int32_t h, - int32_t w, - int32_t c, - int32_t oc, - int32_t wh, - int32_t ww, - int32_t wc, - int32_t oh, - int32_t ow, - // Stride - int16_t s0, - int16_t s1, - // Padding - int16_t p0, - int16_t p1, - // Dilation - int16_t d0, - int16_t d1, - // Group for depthwise conv - int16_t groups, - // Optional args that are only relevant for quantized convolution - // input zero point - IT in_zero_point = 0, - // weight zero point - int32_t weight_zero_point = 0, - float bias_scale = 1, - float out_scale = 1, - OT out_zero_point = 0) { - float inv_out_scale = 1. / out_scale; - bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; - - // Compute the number of in and out channels per group - const int ocpg = oc / groups; - const int icpg = c / groups; - - // Iterate over all the output batches (i.e., n) - for (int _n = 0; _n < n; ++_n) { - const IT* in_batch = p_in + _n * h * w * c; - OT* out_batch = p_out + _n * oh * ow * oc; - for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { - for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { - OT* out_line = out_batch + (_oh * ow + _ow) * oc; - // Compute separable convolution for each group - for (int _g = 0; _g < groups; ++_g) { - // Identify the input and output channels involved in the computation - // of this group - int sic = _g * icpg; - int soc = _g * ocpg; - // Populate all the output channels in the group - for (int _oc = soc; _oc < soc + ocpg; ++_oc) { - const WT* weight_batch = p_weight + _oc * wh * ww * wc; - // We compute one output channel at a time. The computation can be - // thought of as a stencil computation: we iterate over an input of - // size h x w x icpg, with a stencil of size wh x ww x icpg, to - // compute an output channel of size oh x ow x 1. - float acc = p_bias[_oc]; - // Below is the stencil computation that performs the hadamard - // product+accumulation of each input channel (contributing to - // the output channel being computed) with the corresponding - // weight channel. If the padding is 0, and dilation is 1, then - // we can remove the unnecessary checks, and simplify the code - // so that it can be vectorized by Tensilica compiler.x`` - if (zero_pad_unit_dilation) { - for (int _wh = 0; _wh < wh; ++_wh) { - for (int _ww = 0; _ww < ww; ++_ww) { - const IT* in_line = - in_batch + (_h + _wh) * w * c + (_w + _ww) * c; - const WT* weight_line = - weight_batch + _wh * ww * wc + _ww * wc; - for (int _ic = sic; _ic < sic + icpg; ++_ic) { - float lhs = in_line[_ic] - in_zero_point; - float rhs = weight_line[_ic - sic] - - (quantized ? 
weight_zero_point : 0); - acc += lhs * rhs; - } - } - } - } else { - for (int _wh = 0; _wh < wh; ++_wh) { - for (int _ww = 0; _ww < ww; ++_ww) { - if (((_h + d0 * _wh - p0) >= 0) && - ((_h + d0 * _wh - p0) < h) && - ((_w + d1 * _ww - p1) >= 0) && - ((_w + d1 * _ww - p1 < w))) { - const IT* in_line = in_batch + - (_h + d0 * _wh - p0) * w * c + (_w + d1 * _ww - p1) * c; - const WT* weight_line = - weight_batch + _wh * ww * wc + _ww * wc; - for (int _ic = sic; _ic < sic + icpg; ++_ic) { - float lhs = in_line[_ic] - in_zero_point; - float rhs = weight_line[_ic - sic] - - (quantized ? weight_zero_point : 0); - acc += lhs * rhs; - } - } - } - } - } - if (quantized) { - float val = bias_scale * acc; - out_line[_oc] = ::impl::reference::kernels::quantize( - val, inv_out_scale, out_zero_point); - } else { - out_line[_oc] = acc; - } - } - } - } - } - } -} - // The quantized convolution kernel. in_scale and weight_scale are implicit in // bias_scale, since it is a product of the two. The kernel will branch to // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. -void quantized_conv_nchw( +void quantized_conv_out( + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int16_t groups, - int32_t in_zero_point, - int32_t weight_zero_point, - float bias_scale, - float output_scale, - int32_t output_zero_point, + executorch::aten::IntArrayRef stride, + executorch::aten::IntArrayRef padding, + executorch::aten::IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + bool channel_last, Tensor& out) { bool conv1d = input.dim() == 3; // input = [n, c, h, w] @@ -306,224 +186,76 @@ void quantized_conv_nchw( const int oh = conv1d ? 1 : out.size(2); const int ow = conv1d ? out.size(2) : out.size(3); -#define typed_quantized_conv2d_nchw(ctype, dtype) \ - case ScalarType::dtype: { \ - conv2d_nchw_core_generic( \ - input.const_data_ptr(), \ - weight.const_data_ptr(), \ - bias.const_data_ptr(), \ - out.mutable_data_ptr(), \ - n, \ - c, \ - h, \ - w, \ - oc, \ - wc, \ - wh, \ - ww, \ - oh, \ - ow, \ - stride[0], \ - stride[1], \ - padding[0], \ - padding[1], \ - dilation[0], \ - dilation[1], \ - groups, \ - in_zero_point, \ - weight_zero_point, \ - bias_scale, \ - output_scale, \ - (ctype)output_zero_point); \ - break; \ - } - ScalarType dtype = out.scalar_type(); - switch (dtype) { - ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nchw); - default: - ET_DCHECK_MSG( - false, "Unhandled dtype %s", torch::executor::toString(dtype)); - } - -#undef typed_quantized_conv2d_nchw -} - -void quantized_conv_nhwc( - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int16_t groups, - int32_t in_zero_point, - int32_t weight_zero_point, - float bias_scale, - float output_scale, - int32_t output_zero_point, - Tensor& out) { - bool conv1d = input.dim() == 3; - // input = [n, h, w, c] - const int n = input.size(0); - const int h = conv1d ? 1 : input.size(1); - const int w = conv1d ? input.size(1) : input.size(2); - const int c = conv1d ? input.size(2) : input.size(3); - // weight = [oc, wh, ww, wc] - const int oc = weight.size(0); - const int wh = conv1d ? 1 : weight.size(1); - const int ww = conv1d ? 
weight.size(1) : weight.size(2); - const int wc = conv1d ? weight.size(2) : weight.size(3); - // output = [n, oh, ow, oc] - const int oh = conv1d ? 1 : out.size(1); - const int ow = conv1d ? out.size(1) : out.size(2); - -#define typed_quantized_conv2d_nhwc(ctype, dtype) \ - case ScalarType::dtype: { \ - conv2d_nhwc_core_generic( \ - input.const_data_ptr(), \ - weight.const_data_ptr(), \ - bias.const_data_ptr(), \ - out.mutable_data_ptr(), \ - n, \ - h, \ - w, \ - c, \ - oc, \ - wh, \ - ww, \ - wc, \ - oh, \ - ow, \ - stride[0], \ - stride[1], \ - padding[0], \ - padding[1], \ - dilation[0], \ - dilation[1], \ - groups, \ - in_zero_point, \ - weight_zero_point, \ - bias_scale, \ - output_scale, \ - (ctype)output_zero_point); \ - break; \ - } - ScalarType dtype = out.scalar_type(); - switch (dtype) { - ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nhwc); - default: - ET_DCHECK_MSG( - false, "Unhandled dtype %s", torch::executor::toString(dtype)); - } - -#undef typed_quantized_conv2d_nhwc -} + // Bool flag to check if weight tensor is quantized per-tensor or + // per-channel + bool per_tensor_quantized = bias_scale.numel() == 1; -void quantized_conv_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - const Tensor& weight_zero_point, - const Tensor& bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED const Tensor& out_multiplier, - __ET_UNUSED const Tensor& out_shift, - bool channel_last, - Tensor& out) { - const float bias_scale_float = bias_scale.const_data_ptr()[0]; - const int32_t weight_zero_point_int = - weight_zero_point.const_data_ptr()[0]; - if (channel_last) { - quantized_conv_nhwc( - input, - weight, - bias, - stride, - padding, - dilation, + if (out.scalar_type() == exec_aten::ScalarType::Byte) { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + dilation[0], + dilation[1], groups, in_zero_point, - weight_zero_point_int, - bias_scale_float, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), output_scale, - output_zero_point, - out); - } else { - quantized_conv_nchw( - input, - weight, - bias, - stride, - padding, - dilation, + (uint8_t)output_zero_point, + per_tensor_quantized); + } else if (out.scalar_type() == exec_aten::ScalarType::Char) { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + dilation[0], + dilation[1], groups, in_zero_point, - weight_zero_point_int, - bias_scale_float, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), output_scale, - output_zero_point, - out); - } -} - -void quantized_conv_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - bool channel_last, - Tensor& out) { - if 
(channel_last) { - quantized_conv_nhwc( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); + (int8_t)output_zero_point, + per_tensor_quantized); } else { - quantized_conv_nchw( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); } } -} // namespace native -} // namespace reference -} // namespace impl +}; // namespace native +}; // namespace reference +}; // namespace impl diff --git a/backends/cadence/reference/operators/quantized_linear_out.cpp b/backends/cadence/reference/operators/quantized_linear_out.cpp index 4f7ca9cc3c..7bb1bf6fb4 100644 --- a/backends/cadence/reference/operators/quantized_linear_out.cpp +++ b/backends/cadence/reference/operators/quantized_linear_out.cpp @@ -6,8 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#include #include namespace impl { @@ -86,7 +85,6 @@ void quantized_linear_out( int64_t out_zero_point, __ET_UNUSED const executorch::aten::optional& offset, Tensor& out) { - // TODO: refactor to use switch case as quantized_linear_per_tensor_out if (out.scalar_type() == executorch::aten::ScalarType::Byte) { _typed_quantized_linear( src, @@ -117,43 +115,6 @@ void quantized_linear_out( } } -void quantized_linear_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& src, - const Tensor& weight, - const Tensor& bias, - const int64_t src_zero_point, - const int64_t weight_zero_point, - const int64_t out_multiplier, - const int64_t out_shift, - const int64_t out_zero_point, - __ET_UNUSED const executorch::aten::optional& offset, - Tensor& out) { -#define typed_quantized_linear_per_tensor(ctype, dtype) \ - case executorch::aten::ScalarType::dtype: { \ - quantized_linear_per_tensor_( \ - src, \ - weight, \ - bias, \ - src_zero_point, \ - weight_zero_point, \ - out_multiplier, \ - out_shift, \ - out_zero_point, \ - out); \ - break; \ - } - - executorch::aten::ScalarType dtype = out.scalar_type(); - switch (dtype) { - ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear_per_tensor); - default: - ET_DCHECK_MSG( - false, "Unhandled dtype %s", executorch::runtime::toString(dtype)); - } -#undef typed_quantized_linear_per_tensor -} - }; // namespace native }; // namespace reference }; // namespace impl diff --git a/backends/cadence/reference/operators/quantized_ops.h b/backends/cadence/reference/operators/quantized_ops.h deleted file mode 100644 index 66545c8e58..0000000000 --- a/backends/cadence/reference/operators/quantized_ops.h +++ /dev/null @@ -1,190 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
- -#pragma once - -#include -#include - -template -inline __attribute__((always_inline)) void quantized_linear_per_tensor_( - const ::executorch::aten::Tensor& src, - const ::executorch::aten::Tensor& weight, - const ::executorch::aten::Tensor& bias, - const int64_t src_zero_point, - const int64_t weight_zero_point, - const int64_t out_multiplier, - const int64_t out_shift, - const int64_t out_zero_point, - ::executorch::aten::Tensor& out) { - // input comes in shape [leading_dims, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [leading_dims, out_dim] - // Perform matrix multiply (M x N) x (N x P)' => M x P - const int64_t leading_dims = - executorch::runtime::getLeadingDims(src, src.dim() - 1); - const int64_t out_dim = weight.size(0); // = out_dim - const int64_t in_dim = weight.size(1); // = in_dim - - const T* __restrict__ in_data = src.const_data_ptr(); - const T* __restrict__ weight_data = weight.const_data_ptr(); - const int32_t* __restrict__ bias_data = bias.const_data_ptr(); - T* __restrict__ out_data = out.mutable_data_ptr(); - - // Compute the requant_scale from out_multiplier and out_shift - const float requant_scale = - -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); - - for (size_t i = 0; i < leading_dims; ++i) { - for (size_t j = 0; j < out_dim; ++j) { - int32_t sum = bias_data[j]; - for (size_t k = 0; k < in_dim; ++k) { - int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point; - int32_t w = - (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; - sum += x * w; - } - out_data[i * out_dim + j] = ::impl::reference::kernels::quantize( - sum, requant_scale, out_zero_point); - } - } -} - -template -inline __attribute__((always_inline)) void quantized_linear_per_tensor_( - const ::executorch::aten::Tensor& src, - const ::executorch::aten::Tensor& weight, - const ::executorch::aten::Tensor& bias, - int64_t src_zero_point, - const ::executorch::aten::Tensor& weight_zero_point_t, - int64_t out_multiplier, - int64_t out_shift, - int64_t out_zero_point, - ::executorch::aten::Tensor& out) { - // Get the zero_point of weight. 
- int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; - quantized_linear_per_tensor_( - src, - weight, - bias, - src_zero_point, - weight_zero_point, - out_multiplier, - out_shift, - out_zero_point, - out); -} - -template -inline __attribute__((always_inline)) void quantized_linear_per_channel_( - const ::executorch::aten::Tensor& src, - const ::executorch::aten::Tensor& weight, - const ::executorch::aten::Tensor& bias, - int64_t src_zero_point, - int64_t weight_zero_point, - const ::executorch::aten::Tensor& out_multiplier, - const ::executorch::aten::Tensor& out_shift, - int64_t out_zero_point, - ::executorch::aten::Tensor& out) { - // input comes in shape [leading_dims, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [leading_dims, out_dim] - // Perform matrix multiply (M x N) x (N x P)' => M x P - int64_t leading_dims = - executorch::runtime::getLeadingDims(src, src.dim() - 1); - const int64_t out_dim = weight.size(0); // = out_dim - const int64_t in_dim = weight.size(1); // = in_dim - - const T* __restrict__ in_data = src.const_data_ptr(); - const T* __restrict__ weight_data = weight.const_data_ptr(); - const int32_t* __restrict__ bias_data = bias.const_data_ptr(); - T* __restrict__ out_data = out.mutable_data_ptr(); - const int32_t* __restrict__ out_multiplier_data = - out_multiplier.const_data_ptr(); - const int32_t* __restrict__ out_shift_data = - out_shift.const_data_ptr(); - - for (size_t i = 0; i < leading_dims; ++i) { - for (size_t j = 0; j < out_dim; ++j) { - int32_t sum = bias_data[j]; - for (size_t k = 0; k < in_dim; ++k) { - int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point; - int32_t w = - (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; - sum += x * w; - } - // Compute the out_scale from out_multiplier and out_shift - const float out_scale = - -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]); - out_data[i * out_dim + j] = ::impl::reference::kernels::quantize( - sum, out_scale, out_zero_point); - } - } -} - -template -inline __attribute__((always_inline)) void quantized_linear_( - const ::executorch::aten::Tensor& src, - const ::executorch::aten::Tensor& weight, - const ::executorch::aten::Tensor& bias, - int64_t src_zero_point, - int64_t weight_zero_point, - const ::executorch::aten::Tensor& out_multiplier, - const ::executorch::aten::Tensor& out_shift, - int64_t out_zero_point, - ::executorch::aten::Tensor& out) { - if (out_multiplier.numel() == 1) { - // Use per-tensor quantization kernel. - const int32_t* __restrict__ out_multiplier_data = - out_multiplier.const_data_ptr(); - const int32_t* __restrict__ out_shift_data = - out_shift.const_data_ptr(); - quantized_linear_per_tensor_( - src, - weight, - bias, - src_zero_point, - weight_zero_point, - out_multiplier_data[0], - out_shift_data[0], - out_zero_point, - out); - return; - } - - // Use per-channel quantization kernel. 
- quantized_linear_per_channel_( - src, - weight, - bias, - src_zero_point, - weight_zero_point, - out_multiplier, - out_shift, - out_zero_point, - out); -} - -template -inline __attribute__((always_inline)) void quantized_linear_( - const ::executorch::aten::Tensor& src, - const ::executorch::aten::Tensor& weight, - const ::executorch::aten::Tensor& bias, - int64_t src_zero_point, - const ::executorch::aten::Tensor& weight_zero_point_t, - const ::executorch::aten::Tensor& out_multiplier, - const ::executorch::aten::Tensor& out_shift, - int64_t out_zero_point, - ::executorch::aten::Tensor& out) { - // Get the zero_point of weight. - int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; - quantized_linear_( - src, - weight, - bias, - src_zero_point, - weight_zero_point, - out_multiplier, - out_shift, - out_zero_point, - out); -} diff --git a/backends/cadence/reference/operators/targets.bzl b/backends/cadence/reference/operators/targets.bzl index 488aeebb82..347d476239 100644 --- a/backends/cadence/reference/operators/targets.bzl +++ b/backends/cadence/reference/operators/targets.bzl @@ -7,9 +7,6 @@ def define_common_targets(): srcs = glob([ "*.cpp", ]), - exported_headers =glob([ - "*.h", - ]), platforms = CXX, deps = [ "//executorch/kernels/portable/cpu/util:broadcast_util",
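
Note on the requantization path kept by this patch: quantized_conv_out now receives weight_zero_point and bias_scale as tensors and chooses between per-tensor and per-channel scaling inside conv2d_nchw_core_generic via the per_tensor_quantized flag (bias_scale.numel() == 1). The standalone C++ sketch below only illustrates that selection plus the final requantization step; quantize_value is a simplified stand-in for the reference kernels::quantize helper (its exact rounding and saturation behavior may differ), and the sample values are made up for the demo.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-in for the reference kernels::quantize helper:
// round to nearest, add the output zero point, saturate to int8.
static int8_t quantize_value(float val, float inv_out_scale, int32_t out_zero_point) {
  int32_t q =
      static_cast<int32_t>(std::nearbyint(val * inv_out_scale)) + out_zero_point;
  return static_cast<int8_t>(std::clamp(q, -128, 127));
}

int main() {
  // One float accumulator per output channel, as produced by the conv loops.
  std::vector<float> acc = {100.f, -40.f, 250.f};

  // bias_scale with a single element means per-tensor quantization;
  // one element per output channel means per-channel quantization.
  std::vector<float> bias_scale = {0.05f, 0.02f, 0.01f};
  bool per_tensor_quantized = bias_scale.size() == 1;

  float out_scale = 0.1f;
  float inv_out_scale = 1.f / out_scale;
  int32_t out_zero_point = 3;

  for (size_t oc = 0; oc < acc.size(); ++oc) {
    // Mirrors the kernel's
    //   val = (per_tensor_quantized ? bias_scale[0] : bias_scale[_oc]) * acc;
    float val = (per_tensor_quantized ? bias_scale[0] : bias_scale[oc]) * acc[oc];
    std::printf("oc=%zu -> %d\n",
                oc,
                static_cast<int>(quantize_value(val, inv_out_scale, out_zero_point)));
  }
  return 0;
}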