diff --git a/scripts/compare/compare.py b/scripts/compare/compare.py index c235fd1bf..3f8b0f766 100644 --- a/scripts/compare/compare.py +++ b/scripts/compare/compare.py @@ -23,6 +23,7 @@ def parse_args(): args.actual, ) + def getDiff(base, test): absolute_diff = np.subtract(base, test) max_absolute_diff = np.max(np.abs(absolute_diff)) @@ -35,16 +36,19 @@ def getDiff(base, test): return max_absolute_diff, max_relative_diff -def compare_npy(actual_path, expect_path, edge, node): + +def compare_npy(node, actual_path, expect_path): actual = np.load(actual_path) expect = np.load(expect_path) if np.isnan(actual).any(): - print(f"NAN value in node:{node} edge:{edge}") + print(f"NAN value in node:{node}\t{actual_path}\t{expect_path}") return - + max_absolute_diff, max_relative_diff = getDiff(expect, actual) - if max_absolute_diff != 0.0: ## No need to print tensor with no diff - print(f'{max_absolute_diff}\t{max_relative_diff}\t{node}\t{edge}') + if max_absolute_diff != 0.0: ## No need to print tensor with no diff + print( + f"{max_absolute_diff}\t{max_relative_diff}\t{node}\t{actual_path}\t{expect_path}" + ) def main(): @@ -70,9 +74,7 @@ def main(): expect_file = expect_file + ".npy" expect_file_path = os.path.join(expect_dir, expect_file) if os.path.exists(expect_file_path): - compare_npy( - actual_file_path, expect_file_path, edge_name, node_name - ) + compare_npy(meta_file, actual_file_path, expect_file_path) if __name__ == "__main__": diff --git a/src/04kernel/include/kernel/attributes/broadcaster.h b/src/04kernel/include/kernel/attributes/broadcaster.h index 006fa9adb..b95c45680 100644 --- a/src/04kernel/include/kernel/attributes/broadcaster.h +++ b/src/04kernel/include/kernel/attributes/broadcaster.h @@ -15,6 +15,7 @@ namespace refactor::kernel { explicit Broadcaster(std::vector>); explicit Broadcaster(TensorRefs const &inputs); void locate(dim_t k, dim_t ans[]) const noexcept; + bool needBroadcast() const noexcept; }; }// namespace refactor::kernel diff --git a/src/04kernel/include/kernel/attributes/matmul_info.h b/src/04kernel/include/kernel/attributes/mat_mul_info.h similarity index 62% rename from src/04kernel/include/kernel/attributes/matmul_info.h rename to src/04kernel/include/kernel/attributes/mat_mul_info.h index 831c1f9a3..95440ba2f 100644 --- a/src/04kernel/include/kernel/attributes/matmul_info.h +++ b/src/04kernel/include/kernel/attributes/mat_mul_info.h @@ -1,9 +1,8 @@ -#ifndef KERNEL_MATMUL_INFO_H -#define KERNEL_MATMUL_INFO_H +#ifndef KERNEL_MAT_MUL_INFO_H +#define KERNEL_MAT_MUL_INFO_H #include "kernel/attributes/broadcaster.h" #include "kernel/attributes/expand_info.h" -#include namespace refactor::kernel { @@ -11,11 +10,11 @@ namespace refactor::kernel { DataType dataType; float alpha, beta; bool transA, transB; - size_t m, k, n; + dim_t m, k, n; // Expand operation info for biasd std::optional biasExpand; - // A constant batch or a 2-directional broadcaster that deals with dimensions before the last 2 dimensions - std::variant broadcasterOrBatch; + // A 2-directional broadcaster that deals with dimensions before the last 2 dimensions + Broadcaster broadcaster; MatMulInfo(Tensor const &, Tensor const &, std::optional>, @@ -24,4 +23,4 @@ namespace refactor::kernel { }// namespace refactor::kernel -#endif// KERNEL_MATMUL_INFO_H +#endif// KERNEL_MAT_MUL_INFO_H diff --git a/src/04kernel/include/kernel/attributes/mat_mul_integer_info.h b/src/04kernel/include/kernel/attributes/mat_mul_integer_info.h new file mode 100644 index 000000000..b7d8d4759 --- /dev/null +++ 
b/src/04kernel/include/kernel/attributes/mat_mul_integer_info.h @@ -0,0 +1,28 @@ +#ifndef KERNEL_MAT_MUL_INTEGER_INFO_H +#define KERNEL_MAT_MUL_INTEGER_INFO_H + +#include "kernel/attributes/broadcaster.h" + +namespace refactor::kernel { + + struct MatMulIntegerInfo { + struct Input { + bool + withZeroPoint, + signed_, + scalar; + + Input(TensorRefs const &, size_t i) noexcept; + }; + + Input a, b; + dim_t m, k, n; + Broadcaster broadcaster; + + explicit MatMulIntegerInfo(TensorRefs const &inputs) noexcept; + dim_t batch() const noexcept; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_MAT_MUL_INTEGER_INFO_H diff --git a/src/04kernel/include/kernel/collectors/dequantize_linear.h b/src/04kernel/include/kernel/collectors/dequantize_linear.h new file mode 100644 index 000000000..640a4329a --- /dev/null +++ b/src/04kernel/include/kernel/collectors/dequantize_linear.h @@ -0,0 +1,18 @@ +#ifndef KERNEL_DEQUANTIZE_LINEAR_H +#define KERNEL_DEQUANTIZE_LINEAR_H + +#include "../collector.h" + +namespace refactor::kernel { + + struct DequantizeLinearCollector final : public InfoCollector { + + explicit DequantizeLinearCollector(decltype(_target)) noexcept; + + std::vector + filter(TensorRefs inputs, TensorRefs outputs) const final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_DEQUANTIZE_LINEAR_H diff --git a/src/04kernel/include/kernel/collectors/dynamic_quantize_linear.h b/src/04kernel/include/kernel/collectors/dynamic_quantize_linear.h new file mode 100644 index 000000000..52a2a0445 --- /dev/null +++ b/src/04kernel/include/kernel/collectors/dynamic_quantize_linear.h @@ -0,0 +1,18 @@ +#ifndef KERNEL_DYNAMIC_QUANTIZE_LINEAR_H +#define KERNEL_DYNAMIC_QUANTIZE_LINEAR_H + +#include "../collector.h" + +namespace refactor::kernel { + + struct DynamicQuantizeLinearCollector final : public InfoCollector { + + explicit DynamicQuantizeLinearCollector(decltype(_target)) noexcept; + + std::vector + filter(TensorRefs inputs, TensorRefs outputs) const final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_DYNAMIC_QUANTIZE_LINEAR_H diff --git a/src/04kernel/include/kernel/collectors/mat_mul_integer.h b/src/04kernel/include/kernel/collectors/mat_mul_integer.h new file mode 100644 index 000000000..3534b0125 --- /dev/null +++ b/src/04kernel/include/kernel/collectors/mat_mul_integer.h @@ -0,0 +1,19 @@ +#ifndef KERNEL_MAT_MUL_INTEGER_H +#define KERNEL_MAT_MUL_INTEGER_H + +#include "../collector.h" + +namespace refactor::kernel { + + struct MatMulIntegerCollector final : public InfoCollector { + + constexpr MatMulIntegerCollector(decltype(_target) target) noexcept + : InfoCollector(target) {} + + std::vector + filter(TensorRefs inputs, TensorRefs outputs) const final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_MAT_MUL_INTEGER_H diff --git a/src/04kernel/src/attributes/broadcaster.cc b/src/04kernel/src/attributes/broadcaster.cc index c35fb3ae1..d0c4bea73 100644 --- a/src/04kernel/src/attributes/broadcaster.cc +++ b/src/04kernel/src/attributes/broadcaster.cc @@ -96,4 +96,8 @@ namespace refactor::kernel { } } + bool Broadcaster::needBroadcast() const noexcept { + return !strides.empty(); + } + }// namespace refactor::kernel diff --git a/src/04kernel/src/attributes/matmul_info.cc b/src/04kernel/src/attributes/mat_mul_info.cc similarity index 66% rename from src/04kernel/src/attributes/matmul_info.cc rename to src/04kernel/src/attributes/mat_mul_info.cc index 5f1162072..3619e9b59 100644 --- a/src/04kernel/src/attributes/matmul_info.cc +++ 
b/src/04kernel/src/attributes/mat_mul_info.cc @@ -1,10 +1,8 @@ -#include "kernel/attributes/matmul_info.h" -#include -#include +#include "kernel/attributes/mat_mul_info.h" namespace refactor::kernel { - ExpandInfo buildBias(size_t m, size_t n, + ExpandInfo buildBias(dim_t m, dim_t n, Tensor const &a, Tensor const &b, Tensor const &c) { @@ -12,8 +10,8 @@ namespace refactor::kernel { auto it = output.rbegin(); *it++ = n; *it++ = m; - for (auto da = static_cast(a.rank() - 2), - db = static_cast(b.rank() - 2); + for (auto da = static_cast(a.rank() - 2), + db = static_cast(b.rank() - 2); auto i : range0_(output.size() - 2)) { auto a_ = i < da ? a.shape[da - i - 1] : 1; auto b_ = i < db ? b.shape[db - i - 1] : 1; @@ -26,13 +24,6 @@ namespace refactor::kernel { slice(output.data(), output.size())); } - std::variant buildBroadcasterOrBatch(slice_t dimA, slice_t dimB) { - if (std::equal(dimA.begin(), dimA.end(), dimB.begin(), dimB.end())) { - return std::accumulate(dimA.begin(), dimA.end(), (size_t) 1, std::multiplies()); - } - return Broadcaster({dimA, dimB}); - } - MatMulInfo::MatMulInfo( Tensor const &a, Tensor const &b, std::optional> c, @@ -44,7 +35,8 @@ namespace refactor::kernel { k(transA ? a.shape.rbegin()[1] : a.shape.rbegin()[0]), n(transB ? b.shape.rbegin()[1] : b.shape.rbegin()[0]), biasExpand(c ? std::make_optional(buildBias(m, n, a, b, *c)) : std::nullopt), - broadcasterOrBatch(buildBroadcasterOrBatch(slice(a.shape.data(), a.shape.size() - 2), slice(b.shape.data(), b.shape.size() - 2))) { + broadcaster({slice(a.shape.data(), a.shape.size() - 2), + slice(b.shape.data(), b.shape.size() - 2)}) { auto kB = transB ? b.shape.rbegin()[0] : b.shape.rbegin()[1]; ASSERT(k == kB, "MatMul: input shape not matched."); } diff --git a/src/04kernel/src/attributes/mat_mul_integer_info.cc b/src/04kernel/src/attributes/mat_mul_integer_info.cc new file mode 100644 index 000000000..a16689fe2 --- /dev/null +++ b/src/04kernel/src/attributes/mat_mul_integer_info.cc @@ -0,0 +1,42 @@ +#include "kernel/attributes/mat_mul_integer_info.h" + +namespace refactor::kernel { + + MatMulIntegerInfo::Input::Input(TensorRefs const &inputs, size_t i) noexcept + : withZeroPoint(false), + signed_(true), + scalar(true) { + if (inputs.size() > i + 2) { + auto const &t = inputs[i + 2].get(); + auto size = t.elementsSize(); + if (t.data) { + auto data = slice(t.data->get(), size); + if (std::all_of(data.begin(), data.end(), [](auto x) { return x == 0; })) { + return; + } + } + withZeroPoint = true; + signed_ = t.dataType == DataType::I8; + scalar = size == 1; + } + } + + MatMulIntegerInfo::MatMulIntegerInfo(TensorRefs const &inputs) noexcept + : a(inputs, 0), + b(inputs, 1), +#define A (inputs[0].get().shape) +#define B (inputs[1].get().shape) + m(A.rbegin()[1]), + k(A.rbegin()[0]), + n(B.rbegin()[0]), + broadcaster({slice(A.data(), A.size() - 2), + slice(B.data(), B.size() - 2)}) { + } +#undef A +#undef B + + dim_t MatMulIntegerInfo::batch() const noexcept { + return broadcaster.outputsCount; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/collectors/dequantize_linear.cc b/src/04kernel/src/collectors/dequantize_linear.cc new file mode 100644 index 000000000..78fc939a4 --- /dev/null +++ b/src/04kernel/src/collectors/dequantize_linear.cc @@ -0,0 +1,32 @@ +#include "kernel/collectors/dequantize_linear.h" +#include "../kernels/dequantize_linear/cpu_kernel.hh" +#include "../kernels/dequantize_linear/cuda_kernel.hh" + +namespace refactor::kernel { + + DequantizeLinearCollector:: + 
DequantizeLinearCollector(decltype(_target) target) noexcept + : InfoCollector(target) {} + + std::vector + DequantizeLinearCollector::filter(TensorRefs inputs, TensorRefs outputs) const { + auto const &output = outputs[0]; + std::vector ans; + switch (_target) { + case decltype(_target)::Cpu: + if (auto ptr = DequantizeLinearCpu::build(inputs, output); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + case decltype(_target)::Nvidia: + if (auto ptr = DequantizeLinearCuda::build(inputs, output); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + default: + UNREACHABLEX(void, "Unknown target"); + } + return ans; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/collectors/dynamic_quantize_linear.cc b/src/04kernel/src/collectors/dynamic_quantize_linear.cc new file mode 100644 index 000000000..8d54eee03 --- /dev/null +++ b/src/04kernel/src/collectors/dynamic_quantize_linear.cc @@ -0,0 +1,33 @@ +#include "kernel/collectors/dynamic_quantize_linear.h" +#include "../kernels/dynamic_quantize_linear/cpu_kernel.hh" +#include "../kernels/dynamic_quantize_linear/cuda_kernel.hh" + +namespace refactor::kernel { + + DynamicQuantizeLinearCollector:: + DynamicQuantizeLinearCollector(decltype(_target) target) noexcept + : InfoCollector(target) {} + + std::vector + DynamicQuantizeLinearCollector::filter(TensorRefs inputs, TensorRefs outputs) const { + auto size = inputs[0].get().elementsSize(); + + std::vector ans; + switch (_target) { + case decltype(_target)::Cpu: + if (auto ptr = DynamicQuantizeLinearCpu::build(size); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + case decltype(_target)::Nvidia: + if (auto ptr = DynamicQuantizeLinearCuda::build(size); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + default: + UNREACHABLEX(void, "Unknown target"); + } + return ans; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/collectors/mat_mul.cc b/src/04kernel/src/collectors/mat_mul.cc index 5a72240f8..7581200cd 100644 --- a/src/04kernel/src/collectors/mat_mul.cc +++ b/src/04kernel/src/collectors/mat_mul.cc @@ -1,8 +1,7 @@ #include "kernel/collectors/mat_mul.h" #include "../kernels/mat_mul/cpu_kernel.hh" #include "../kernels/mat_mul/cublas_kernel.hh" -#include "common.h" -#include "kernel/attributes/matmul_info.h" +#include "kernel/attributes/mat_mul_info.h" namespace refactor::kernel { #define REGISTER(T) \ diff --git a/src/04kernel/src/collectors/mat_mul_integer.cc b/src/04kernel/src/collectors/mat_mul_integer.cc new file mode 100644 index 000000000..c0124de91 --- /dev/null +++ b/src/04kernel/src/collectors/mat_mul_integer.cc @@ -0,0 +1,30 @@ +#include "kernel/collectors/mat_mul_integer.h" +#include "../../src/kernels/mat_mul_integer/cpu_kernel.hh" +#include "../../src/kernels/mat_mul_integer/cublas_kernel.hh" +#include "kernel/attributes/mat_mul_integer_info.h" + +namespace refactor::kernel { + + std::vector + MatMulIntegerCollector::filter(TensorRefs inputs, TensorRefs outputs) const { + MatMulIntegerInfo info(inputs); + + std::vector ans; + switch (_target) { + case decltype(_target)::Cpu: + if (auto ptr = MatMulIntegerCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + case decltype(_target)::Nvidia: + if (auto ptr = MatMulIntegerCublas::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + default: + UNREACHABLEX(void, "Unknown target"); + } + return ans; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/conv/cudnn_kernel.cc b/src/04kernel/src/kernels/conv/cudnn_kernel.cc 
index 82848483a..88d293f57 100644 --- a/src/04kernel/src/kernels/conv/cudnn_kernel.cc +++ b/src/04kernel/src/kernels/conv/cudnn_kernel.cc @@ -19,10 +19,17 @@ namespace refactor::kernel { Tensor const &w, std::optional> b, Tensor const &y) -> KernelBox { + static const std::unordered_set + SET{DataType::FP16, DataType::BF16, DataType::F32, DataType::F64, DataType::I8}; #ifndef USE_CUDA return nullptr; #endif + auto dt = x.dataType; + if (!SET.contains(dt) || w.dataType != dt || y.dataType != dt) { + return nullptr; + } + std::optional biasExpand = std::nullopt; if (b) { ASSERT(b->get().shape[0] == y.shape[1], ""); @@ -42,7 +49,7 @@ namespace refactor::kernel { p = poolAttributes.pads(), s = poolAttributes.strides(); return std::make_unique(decltype(info){ - x.dataType, + dt, { static_cast(x.shape[0]), static_cast(x.shape[1]), @@ -134,13 +141,18 @@ namespace refactor::kernel { auto pp = info.pad; auto ss = info.stride; auto dd = info.dilation; + // clang-format off + auto computation = info.dt == DataType::F64 ? DataType::F64 + : info.dt == DataType::I8 ? DataType::I32 + : DataType::F32; + // clang-format on CUDNN_ASSERT(cudnnSetConvolution2dDescriptor( d->conv, std::min(pp[0], pp[2]), std::min(pp[1], pp[3]), ss[0], ss[1], dd[0], dd[1], CUDNN_CROSS_CORRELATION, - cudnnDataTypeConvert(d->f32 ? DataType::F32 : DataType::F64))); + cudnnDataTypeConvert(computation))); if (auto group = xs[1] / ws[1]; group > 1) { CUDNN_ASSERT(cudnnSetConvolutionGroupCount(d->conv, group)); diff --git a/src/04kernel/src/kernels/dequantize_linear/cpu_kernel.cc b/src/04kernel/src/kernels/dequantize_linear/cpu_kernel.cc new file mode 100644 index 000000000..7a53f533b --- /dev/null +++ b/src/04kernel/src/kernels/dequantize_linear/cpu_kernel.cc @@ -0,0 +1,76 @@ +#include "cpu_kernel.hh" +#include +#include + +namespace refactor::kernel { + using K = DequantizeLinearCpu; + + K::DequantizeLinearCpu( + decltype(from) from_, + decltype(size) size_, + decltype(withZeroPoint) withZeroPoint_) noexcept + : Kernel(), + from(from_), + size(size_), + withZeroPoint(withZeroPoint_) {} + + auto K::build(TensorRefs const &inputs, Tensor const &output) noexcept -> KernelBox { + if (inputs[1].get().elementsSize() != 1) { + return nullptr; + } + if (output.dataType != DataType::F32) { + return nullptr; + } + return std::make_unique( + inputs[0].get().dataType, + inputs[0].get().elementsSize(), + inputs.size() > 2); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing dequantize linear using CPU"; + } + + template + auto lowerTyped(size_t size, bool withZeroPoint) noexcept -> RoutineWorkspace { + + return [size, withZeroPoint]// + (Resources &, void *, void const *const *inputs, void *const *outputs) { + auto x = reinterpret_cast(inputs[0]); + auto scale = *reinterpret_cast(inputs[1]); + auto zp = withZeroPoint ? 
*reinterpret_cast(inputs[2]) : 0; + auto y = reinterpret_cast(outputs[0]); + std::transform( + std::execution::par_unseq, + x, x + size, + y, + [scale, zp](TI x) { + return static_cast(x - zp) * scale; + }); + }; + } + + auto K::lower(Resources &) const noexcept -> RoutineWorkspace { + switch (from) { + case DataType::U8: + return lowerTyped(size, withZeroPoint); + case DataType::U16: + return lowerTyped(size, withZeroPoint); + case DataType::I8: + return lowerTyped(size, withZeroPoint); + case DataType::I16: + return lowerTyped(size, withZeroPoint); + case DataType::I32: + return lowerTyped(size, withZeroPoint); + default: + UNREACHABLE(); + } + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/dequantize_linear/cpu_kernel.hh b/src/04kernel/src/kernels/dequantize_linear/cpu_kernel.hh new file mode 100644 index 000000000..977692fd2 --- /dev/null +++ b/src/04kernel/src/kernels/dequantize_linear/cpu_kernel.hh @@ -0,0 +1,29 @@ +#ifndef KERNEL_DEQUANTIZE_LINEAR_CPU_KERNEL_HH +#define KERNEL_DEQUANTIZE_LINEAR_CPU_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct DequantizeLinearCpu final : public Kernel { + DataType from; + size_t size; + bool withZeroPoint; + + DequantizeLinearCpu( + decltype(from), + decltype(size), + decltype(withZeroPoint)) noexcept; + + static KernelBox build(TensorRefs const &, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; + RoutineWorkspace lower(Resources &) const noexcept final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_DEQUANTIZE_LINEAR_CPU_KERNEL_HH diff --git a/src/04kernel/src/kernels/dequantize_linear/cuda_kernel.cc b/src/04kernel/src/kernels/dequantize_linear/cuda_kernel.cc new file mode 100644 index 000000000..1cdc1c45b --- /dev/null +++ b/src/04kernel/src/kernels/dequantize_linear/cuda_kernel.cc @@ -0,0 +1,92 @@ +#include "cuda_kernel.hh" + +#ifdef USE_CUDA +#include "../../generator/nvrtc_repo.h" +#include "kernel/cuda/threads_distributer.cuh" +#endif + +namespace refactor::kernel { + using K = DequantizeLinearCuda; + + K::DequantizeLinearCuda( + decltype(from) from_, + decltype(to) to_, + decltype(size) size_, + decltype(withZeroPoint) withZeroPoint_) noexcept + : Kernel(), + from(from_), + to(to_), + size(size_), + withZeroPoint(withZeroPoint_) {} + + auto K::build(TensorRefs const &inputs, Tensor const &output) noexcept -> KernelBox { +#ifndef USE_CUDA + return nullptr; +#endif + + auto const &x = inputs[0].get(); + if (inputs[1].get().elementsSize() != 1) { + return nullptr; + } + return std::make_unique( + x.dataType, + output.dataType, + x.elementsSize(), + inputs.size() > 2); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing dequantize linear using Nvidia GPU"; + } + +#ifdef USE_CUDA + + constexpr static const char *TEMPLATE = R"~( +extern "C" __global__ void kernel( + {0:} *__restrict__ y, + {1:} const *__restrict__ x, + {0:} const *__restrict__ scale_, + {1:} const *__restrict__ zp_, + size_t n +) {{ + auto zp = zp_ ? 
*zp_ : static_cast<{1:}>(0); + auto scale = *scale_; + for (auto tid = blockIdx.x * blockDim.x + threadIdx.x, + step = blockDim.x * gridDim.x; + tid < n; + tid += step) {{ + y[tid] = static_cast<{0:}>(x[tid] - zp) * scale; + }} +}} +)~"; + + auto K::lower(Resources &res) const -> RoutineWorkspace { + using namespace runtime; + + auto name = fmt::format("DequantizeLinear{}->{}", from.name(), to.name()); + auto code = fmt::format(TEMPLATE, nvrtc::dataType(to), nvrtc::dataType(from)); + return [withZeroPoint = withZeroPoint, + params = cuda::ThreadsDistributer()(size), + h = nvrtc::Handler::compile(name.c_str(), code.c_str(), "kernel")]// + (Resources &, void *, void const *const *inputs, void *const *outputs) { + auto y = outputs[0]; + auto x = inputs[0], + scale = inputs[1], + zp = withZeroPoint ? inputs[2] : nullptr; + auto n = params.n; + void *args[]{&y, &x, &scale, &zp, &n}; + h->launch(params.gridSize, 1, 1, + params.blockSize, 1, 1, + 0, args); + }; + } + +#endif + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/dequantize_linear/cuda_kernel.hh b/src/04kernel/src/kernels/dequantize_linear/cuda_kernel.hh new file mode 100644 index 000000000..143552fd0 --- /dev/null +++ b/src/04kernel/src/kernels/dequantize_linear/cuda_kernel.hh @@ -0,0 +1,32 @@ +#ifndef KERNEL_DEQUANTIZE_LINEAR_CUDA_KERNEL_HH +#define KERNEL_DEQUANTIZE_LINEAR_CUDA_KERNEL_HH + +#include "kernel/kernel.h" +#include "kernel/tensor.h" + +namespace refactor::kernel { + + struct DequantizeLinearCuda final : public Kernel { + DataType from, to; + size_t size; + bool withZeroPoint; + + DequantizeLinearCuda( + decltype(from), + decltype(to), + decltype(size), + decltype(withZeroPoint)) noexcept; + + static KernelBox build(TensorRefs const &, Tensor const &) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_CUDA + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_DEQUANTIZE_LINEAR_CUDA_KERNEL_HH diff --git a/src/04kernel/src/kernels/dynamic_quantize_linear/cpu_kernel.cc b/src/04kernel/src/kernels/dynamic_quantize_linear/cpu_kernel.cc new file mode 100644 index 000000000..cc52f6bd4 --- /dev/null +++ b/src/04kernel/src/kernels/dynamic_quantize_linear/cpu_kernel.cc @@ -0,0 +1,72 @@ +#include "cpu_kernel.hh" +#include +#include + +namespace refactor::kernel { + using K = DynamicQuantizeLinearCpu; + + K::DynamicQuantizeLinearCpu(decltype(size) size_) noexcept + : Kernel(), size(size_) {} + + auto K::build(decltype(size) size) noexcept -> KernelBox { + return std::make_unique(size); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing dynamic quantize linear using CPU"; + } + + template + static TO saturate(TI x) { + constexpr static auto + QMIN = static_cast(std::numeric_limits::min()), + QMAX = static_cast(std::numeric_limits::max()); + return static_cast(std::round(std::clamp(x, QMIN, QMAX))); + } + + auto K::lower(Resources &) const noexcept -> RoutineWorkspace { + using namespace runtime; + return [size = size](Resources &, void *, void const *const *inputs, void *const *outputs) { + using TI = float; + using TO = uint8_t; + + constexpr static auto + ZERO = static_cast(0), + _MIN = std::numeric_limits::min(), + _MAX = 
std::numeric_limits::max(), + QMIN = static_cast(std::numeric_limits::min()), + QMAX = static_cast(std::numeric_limits::max()), + QLEN = QMAX - QMIN; + + auto x = reinterpret_cast(inputs[0]); + auto [min, max] = std::accumulate( + x, x + size, + std::pair{_MAX, _MIN}, + [](auto acc, auto it) { + auto [min, max] = acc; + return std::pair{ + std::min(min, it), + std::max(max, it), + }; + }); + auto len = std::max(ZERO, max) - std::min(ZERO, min); + auto scale = len / QLEN; + auto zp = saturate(QMIN - min * QLEN / len); + + std::transform( + std::execution::par_unseq, + x, x + size, + reinterpret_cast(outputs[0]), + [=](auto it) { return saturate(std::round(it / scale) + zp); }); + *reinterpret_cast(outputs[1]) = scale; + *reinterpret_cast(outputs[2]) = zp; + }; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/dynamic_quantize_linear/cpu_kernel.hh b/src/04kernel/src/kernels/dynamic_quantize_linear/cpu_kernel.hh new file mode 100644 index 000000000..1883179b0 --- /dev/null +++ b/src/04kernel/src/kernels/dynamic_quantize_linear/cpu_kernel.hh @@ -0,0 +1,23 @@ +#ifndef KERNEL_DYNAMIC_QUANTIZE_LINEAR_CPU_KERNEL_HH +#define KERNEL_DYNAMIC_QUANTIZE_LINEAR_CPU_KERNEL_HH + +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct DynamicQuantizeLinearCpu final : public Kernel { + size_t size; + + explicit DynamicQuantizeLinearCpu(decltype(size)) noexcept; + + static KernelBox build(decltype(size)) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; + RoutineWorkspace lower(Resources &) const noexcept final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_DYNAMIC_QUANTIZE_LINEAR_CPU_KERNEL_HH diff --git a/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.cc b/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.cc new file mode 100644 index 000000000..3f26397fe --- /dev/null +++ b/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.cc @@ -0,0 +1,23 @@ +#include "cuda_kernel.hh" + +namespace refactor::kernel { + using K = DynamicQuantizeLinearCuda; + + K::DynamicQuantizeLinearCuda(decltype(size) size_) noexcept + : Kernel(), size(size_) {} + + auto K::build(decltype(size) size) noexcept -> KernelBox { + return std::make_unique(size); + } + + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { return typeId(); } + auto K::description() const noexcept -> std::string_view { + return "Performing dynamic quantize linear using Nvidia GPU"; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.cu b/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.cu new file mode 100644 index 000000000..71b2ce31e --- /dev/null +++ b/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.cu @@ -0,0 +1,138 @@ +#include "cuda_kernel.hh" +#include "hardware/functions.h" +#include "kernel/cuda/threads_distributer.cuh" +#include +#include +#include + +namespace refactor::kernel { + using K = DynamicQuantizeLinearCuda; + + template + struct QuantizeMinMax { + T min, max; + }; + + template + struct QuantizeMapMinMaxFunctor { + __device__ __forceinline__ QuantizeMinMax + operator()(T x) const noexcept { + return {x, x}; + } + }; + + template + struct QuantizeReduceMinMaxFunctor { + __device__ __forceinline__ QuantizeMinMax + operator()(QuantizeMinMax a, QuantizeMinMax b) const noexcept { + 
return {CUB_MIN(a.min, b.min), CUB_MAX(a.max, b.max)}; + } + }; + + template + constexpr static auto + ZERO = static_cast(0); + + template + constexpr static auto + QMIN = static_cast(std::numeric_limits::min()); + + template + constexpr static auto + QMAX = static_cast(std::numeric_limits::max()); + + template + constexpr static auto + QLEN = QMAX - QMIN; + + template + __global__ static void kernel( + size_t n, + QuantizeMinMax const *__restrict__ minmax, + TI const *__restrict__ x, + TO *__restrict__ y, + TI *__restrict__ scale_, + TO *__restrict__ zp_) { + + auto const [min, max] = *minmax; + auto cover0 = QuantizeReduceMinMaxFunctor{}({min, max}, {ZERO, ZERO}); + auto scale = (cover0.max - cover0.min) / QLEN; + auto zp = static_cast(round(QMIN - min / scale)); + + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + for (auto step = blockDim.x * gridDim.x, + i = tid; + i < n; + i += step) { + y[i] = static_cast(std::round(x[i] / scale) + zp); + } + if (tid == 0) { + *scale_ = scale; + *zp_ = zp; + } + } + + auto K::lower(Resources &) const -> RoutineWorkspace { + using namespace runtime; + using TI = float; + using TO = uint8_t; + + constexpr static auto + _MIN = std::numeric_limits::min(), + _MAX = std::numeric_limits::max(); + + auto workspaceSize = hardware::alignBytes(size * sizeof(QuantizeMinMax), 256); + + QuantizeMinMax *nullTyped = nullptr; + size_t tempStorageBytes = 0; + auto e = cub::DeviceReduce::Reduce( + nullptr, tempStorageBytes, + nullTyped, nullTyped, size, + QuantizeReduceMinMaxFunctor{}, + QuantizeMinMax{}); + if (e != cudaSuccess) { + RUNTIME_ERROR(fmt::format("error: {} {}", (int) e, cudaGetErrorString(e))); + } + + auto offset0 = workspaceSize; + workspaceSize += tempStorageBytes; + workspaceSize = hardware::alignBytes(workspaceSize, 256); + + auto offset1 = workspaceSize; + workspaceSize += sizeof(QuantizeMinMax); + + auto routine = [offset0, tempStorageBytes, offset1, + params = cuda::ThreadsDistributer()(size)]// + (Resources &, void *workspacePtr, void const *const *inputs, void *const *outputs) { + auto x = reinterpret_cast(inputs[0]); + auto y = reinterpret_cast(outputs[0]); + auto scale = reinterpret_cast(outputs[1]); + auto zp = reinterpret_cast(outputs[2]); + auto workspace = reinterpret_cast(workspacePtr); + auto doubled = reinterpret_cast *>(workspace); + auto tempStorage = workspace + offset0; + auto minmax = reinterpret_cast *>(workspace + offset1); + + thrust::transform( + thrust::device, + x, x + params.n, doubled, + QuantizeMapMinMaxFunctor{}); + + auto tempStorageSize_ = tempStorageBytes; + auto e = cub::DeviceReduce::Reduce( + tempStorage, tempStorageSize_, + doubled, minmax, params.n, + QuantizeReduceMinMaxFunctor{}, + QuantizeMinMax{_MAX, _MIN}); + if (e != cudaSuccess) { + RUNTIME_ERROR(fmt::format("error: {} {}", (int) e, cudaGetErrorString(e))); + } + + kernel<<>>( + params.n, minmax, x, y, scale, zp); + }; + + return {std::move(routine), workspaceSize}; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.hh b/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.hh new file mode 100644 index 000000000..d027d751e --- /dev/null +++ b/src/04kernel/src/kernels/dynamic_quantize_linear/cuda_kernel.hh @@ -0,0 +1,25 @@ +#ifndef KERNEL_DYNAMIC_QUANTIZE_LINEAR_CUDA_KERNEL_HH +#define KERNEL_DYNAMIC_QUANTIZE_LINEAR_CUDA_KERNEL_HH + +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct DynamicQuantizeLinearCuda final : public Kernel { + size_t size; + + explicit 
DynamicQuantizeLinearCuda(decltype(size)) noexcept; + + static KernelBox build(decltype(size)) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_CUDA + RoutineWorkspace lower(Resources &) const final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_DYNAMIC_QUANTIZE_LINEAR_CUDA_KERNEL_HH diff --git a/src/04kernel/src/kernels/mat_mul/cpu_kernel.cc b/src/04kernel/src/kernels/mat_mul/cpu_kernel.cc index e2c448d43..bcd184c3f 100644 --- a/src/04kernel/src/kernels/mat_mul/cpu_kernel.cc +++ b/src/04kernel/src/kernels/mat_mul/cpu_kernel.cc @@ -1,5 +1,6 @@ #include "cpu_kernel.hh" #include "../expand/cpu_kernel.hh" +#include "../mat_mul_common/cpu_template.hpp" namespace refactor::kernel { using K = MatMulCPU; @@ -8,12 +9,10 @@ namespace refactor::kernel { K::MatMulCPU(decltype(info) info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(MatMulInfo info) noexcept -> KernelBox { - if (!info.dataType.isCpuNumberic()) { - return nullptr; - } - - return std::make_unique(std::move(info)); + auto K::build(decltype(info) info) noexcept -> KernelBox { + return info.dataType.isCpuNumberic() + ? std::make_unique(std::move(info)) + : nullptr; } auto K::typeId() noexcept -> size_t { @@ -26,97 +25,66 @@ namespace refactor::kernel { return "Performing MatMul using CPU"; } - struct MatMulCPUMetaData { - size_t M, K, N; - size_t strideA0, strideA1, strideB0, strideB1; - }; + template + static auto lowerTyped(MatMulInfo const &info, Resources &res) noexcept -> RoutineWorkspace { + + auto biasEx = info.biasExpand + ? std::make_optional(ExpandCpu(*info.biasExpand).lower(res).routine) + : std::nullopt; + + return [info = info, biasEx](runtime::Resources &res, void *, void const *const *inputs, void *const *outputs) { + if (biasEx) { (*biasEx)(res, nullptr, inputs + 2, outputs); } - /* - * 2D matrix multiplication: Y = a * A @ B + b * Y - * Assume bias C has been broadcast to Y already. Beta should be 0 in the absence of bias. - */ - template - void matrixMultiply(T const *A, T const *B, T *Y, - T const alpha, T const beta, - const MatMulCPUMetaData md) { - // #pragma omp parallel for - for (size_t i = 0; i < md.M; i++) { - for (size_t j = 0; j < md.N; j++) { - T sum = 0; - // #pragma omp simd reduction(+ : sum) - for (size_t k = 0; k < md.K; k++) { - sum += A[i * md.strideA0 + k * md.strideA1] * B[k * md.strideB0 + j * md.strideB1]; + MatMulCPUMetaData const md{ + .M = info.m, + .K = info.k, + .N = info.n, + .strideA0 = info.transA ? 1 : info.k, + .strideA1 = info.transA ? info.m : 1, + .strideB0 = info.transB ? 1 : info.n, + .strideB1 = info.transB ? info.k : 1, + .alpha = static_cast(info.alpha), + .beta = static_cast(info.biasExpand ? 
info.beta : 0.0f), + }; + auto const stepY = info.m * info.n, + stepA = info.m * info.k, + stepB = info.k * info.n; + + auto a = reinterpret_cast(inputs[0]); + auto b = reinterpret_cast(inputs[1]); + auto y = reinterpret_cast(outputs[0]); + if (info.broadcaster.needBroadcast()) { + dim_t offset[2]; + for (auto i : range0_(info.broadcaster.outputsCount)) { + info.broadcaster.locate(i, offset); + md.matrixMultiply(a + stepA * offset[0], b + stepB * offset[1], y + stepY * i); + } + } else { + for (auto i : range0_(info.broadcaster.outputsCount)) { + md.matrixMultiply(a + stepA * i, b + stepB * i, y + stepY * i); } - Y[i * md.N + j] = beta * Y[i * md.N + j] + alpha * sum; } - } - } - -#define CASE(T) \ - case DT::T: { \ - using T_ = primitive::type; \ - if (std::holds_alternative(info.broadcasterOrBatch)) { \ - return [alpha = static_cast(info.alpha), \ - beta = static_cast(info.biasExpand ? info.beta : 0.0f), \ - broadcaster = std::get(info.broadcasterOrBatch), \ - md, \ - stepY = info.m * info.n, \ - stepA = info.m * info.k, \ - stepB = info.k * info.n, \ - biasEx = info.biasExpand \ - ? std::make_optional(ExpandCpu(*info.biasExpand).lower(res).routine) \ - : std::nullopt](runtime::Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { \ - if (biasEx) { (*biasEx)(res, workspace, inputs + 2, outputs); } \ - auto A = reinterpret_cast(inputs[0]); \ - auto B = reinterpret_cast(inputs[1]); \ - auto Y = reinterpret_cast(outputs[0]); \ - dim_t offset[2]; \ - for (size_t i = 0; i < broadcaster.outputsCount; i++) { \ - broadcaster.locate(i, offset); \ - matrixMultiply(A + stepA * offset[0], B + stepB * offset[1], Y + stepY * i, alpha, beta, md); \ - } \ - }; \ - } else { \ - return [alpha = static_cast(info.alpha), \ - beta = static_cast(info.biasExpand ? info.beta : 0.0f), \ - batch = std::get(info.broadcasterOrBatch), \ - md, \ - stepY = info.m * info.n, \ - stepA = info.m * info.k, \ - stepB = info.k * info.n, \ - biasEx = info.biasExpand \ - ? std::make_optional(ExpandCpu(*info.biasExpand).lower(res).routine) \ - : std::nullopt](runtime::Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { \ - if (biasEx) { (*biasEx)(res, workspace, inputs + 2, outputs); } \ - auto A = reinterpret_cast(inputs[0]); \ - auto B = reinterpret_cast(inputs[1]); \ - auto Y = reinterpret_cast(outputs[0]); \ - for (size_t i = 0; i < batch; i++) { \ - matrixMultiply(A + stepA * i, B + stepB * i, Y + stepY * i, alpha, beta, md); \ - } \ - }; \ - } \ + }; } auto K::lower(Resources &res) const noexcept -> RoutineWorkspace { - MatMulCPUMetaData md; - md.M = info.m, md.K = info.k, md.N = info.n; - md.strideA0 = info.transA ? 1 : info.k; - md.strideA1 = info.transA ? info.m : 1; - md.strideB0 = info.transB ? 1 : info.n; - md.strideB1 = info.transB ? 
info.k : 1; +#define CASE(T) \ + case DataType::T: \ + return lowerTyped::type>(info, res); switch (info.dataType) { CASE(F32); + CASE(F64); + CASE(U8); - CASE(I8); CASE(U16); + CASE(U32); + CASE(U64); + + CASE(I8); CASE(I16); CASE(I32); CASE(I64); - CASE(F64); - CASE(U32); - CASE(U64); default: UNREACHABLE(); } diff --git a/src/04kernel/src/kernels/mat_mul/cpu_kernel.hh b/src/04kernel/src/kernels/mat_mul/cpu_kernel.hh index ca2ce140b..950cb7e24 100644 --- a/src/04kernel/src/kernels/mat_mul/cpu_kernel.hh +++ b/src/04kernel/src/kernels/mat_mul/cpu_kernel.hh @@ -1,18 +1,17 @@ #ifndef KERNEL_MATMUL_CPU_KERNEL_HH #define KERNEL_MATMUL_CPU_KERNEL_HH -#include "kernel/attributes/matmul_info.h" +#include "kernel/attributes/mat_mul_info.h" #include "kernel/kernel.h" -#include "kernel/tensor.h" namespace refactor::kernel { struct MatMulCPU final : public Kernel { MatMulInfo info; - explicit MatMulCPU(MatMulInfo) noexcept; + explicit MatMulCPU(decltype(info)) noexcept; - static KernelBox build(MatMulInfo) noexcept; + static KernelBox build(decltype(info)) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc index 7a69a192c..5f97d56ba 100644 --- a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc +++ b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc @@ -7,13 +7,12 @@ namespace refactor::kernel { K::MatMulCublas(decltype(info) info_) noexcept : Kernel(), info(std::move(info_)) {} - auto K::build(MatMulInfo info) noexcept -> KernelBox { - static const std::unordered_set TYPE{DT::F32, DT::F64, DT::FP16}; + auto K::build(decltype(info) info) noexcept -> KernelBox { #ifndef USE_CUDA return nullptr; #endif - return TYPE.contains(info.dataType) + return info.dataType.isIeee754() || info.dataType == DT::I8 ? std::make_unique(std::move(info)) : nullptr; } diff --git a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu index 8e32bad24..b0c2a7a0b 100644 --- a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu +++ b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu @@ -1,7 +1,6 @@ #include "../../utilities/cuda/cublas_context.hh" #include "../expand/cuda_kernel.hh" #include "cublas_kernel.hh" -#include namespace refactor::kernel { using namespace runtime; @@ -19,43 +18,46 @@ namespace refactor::kernel { auto m = info.m, n = info.n, k = info.k; - auto strideY = info.m * info.n, - strideA = info.m * info.k, - strideB = info.k * info.n; - auto lda = info.transA ? info.m : info.k, - ldb = info.transB ? info.k : info.n; + auto strideY = m * n, + strideA = m * k, + strideB = k * n; + auto lda = info.transA ? m : k, + ldb = info.transB ? k : n; auto biasEx = info.biasExpand ? 
std::make_optional(ExpandCuda(*info.biasExpand).lower(res).routine) : std::nullopt; // clang-format on - if (std::holds_alternative(info.broadcasterOrBatch)) { - return [batch = std::get(info.broadcasterOrBatch), + if (info.broadcaster.needBroadcast()) { + return [broadcaster = info.broadcaster, cudaDataType, alpha, beta, tA, tB, m, n, k, - strideA, strideB, + strideY, strideA, strideB, lda, ldb, biasEx]// (Resources & res, void *workspace, void const *const *inputs, void *const *outputs) { - // Call expand kernel to broadcast bias if bias is used if (biasEx) { (*biasEx)(res, workspace, inputs + 2, outputs); } + auto handle = res.fetchOrStore()->handle; auto a = reinterpret_cast(inputs[0]); auto b = reinterpret_cast(inputs[1]); auto y = reinterpret_cast(outputs[0]); - cublasGemmStridedBatchedEx( - res.fetchOrStore()->handle, - tB, tA, - n, m, k, - &alpha, - b, cudaDataType, ldb, strideB, - a, cudaDataType, lda, strideA, - &beta, y, cudaDataType, - n, m * n, batch, cudaDataType, - CUBLAS_GEMM_DEFAULT); + uint32_t offset[2]; + for (auto i : range0_(broadcaster.outputsCount)) { + broadcaster.locate(i, offset); + CUBLAS_ASSERT(cublasGemmEx( + handle, + tB, tA, n, m, k, + &alpha, + b + strideB * offset[1], cudaDataType, ldb, + a + strideA * offset[0], cudaDataType, lda, + &beta, y + strideY * i, cudaDataType, n, + cudaDataType, CUBLAS_GEMM_DEFAULT)); + } }; - } else {// if use boradcaster - return [broadcaster = std::get(info.broadcasterOrBatch), + + } else { + return [batch = info.broadcaster.outputsCount, cudaDataType, alpha, beta, tA, tB, m, n, k, @@ -63,25 +65,22 @@ namespace refactor::kernel { lda, ldb, biasEx]// (Resources & res, void *workspace, void const *const *inputs, void *const *outputs) { + // Call expand kernel to broadcast bias if bias is used if (biasEx) { (*biasEx)(res, workspace, inputs + 2, outputs); } + auto handle = res.fetchOrStore()->handle; auto a = reinterpret_cast(inputs[0]); auto b = reinterpret_cast(inputs[1]); auto y = reinterpret_cast(outputs[0]); - uint32_t offset[2]; - for (auto i : range0_(broadcaster.outputsCount)) { - broadcaster.locate(i, offset); - cublasGemmEx( - res.fetchOrStore()->handle, - tB, tA, - n, m, k, - &alpha, - b + strideB * offset[1], cudaDataType, ldb, - a + strideA * offset[0], cudaDataType, lda, - &beta, y + strideY * i, cudaDataType, - n, cudaDataType, - CUBLAS_GEMM_DEFAULT); - } + CUBLAS_ASSERT(cublasGemmStridedBatchedEx( + handle, + tB, tA, n, m, k, + &alpha, + b, cudaDataType, ldb, strideB, + a, cudaDataType, lda, strideA, + &beta, y, cudaDataType, n, + strideY, batch, + cudaDataType, CUBLAS_GEMM_DEFAULT)); }; } } diff --git a/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh b/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh index 01a511527..9d49aa805 100644 --- a/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh +++ b/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh @@ -1,7 +1,7 @@ #ifndef KERNEL_MATMUL_CUBLAS_KERNEL_HH #define KERNEL_MATMUL_CUBLAS_KERNEL_HH -#include "kernel/attributes/matmul_info.h" +#include "kernel/attributes/mat_mul_info.h" #include "kernel/kernel.h" namespace refactor::kernel { @@ -9,9 +9,9 @@ namespace refactor::kernel { struct MatMulCublas final : public Kernel { MatMulInfo info; - explicit MatMulCublas(MatMulInfo) noexcept; + explicit MatMulCublas(decltype(info)) noexcept; - static KernelBox build(MatMulInfo) noexcept; + static KernelBox build(decltype(info)) noexcept; static size_t typeId() noexcept; size_t kernelTypeId() const noexcept final; diff --git 
a/src/04kernel/src/kernels/mat_mul_common/cpu_template.hpp b/src/04kernel/src/kernels/mat_mul_common/cpu_template.hpp
new file mode 100644
index 000000000..766279b3f
--- /dev/null
+++ b/src/04kernel/src/kernels/mat_mul_common/cpu_template.hpp
@@ -0,0 +1,35 @@
+#ifndef KERNEL_MATMUL_COMMON_CPU_TEMPLATE_HPP
+#define KERNEL_MATMUL_COMMON_CPU_TEMPLATE_HPP
+
+namespace refactor::kernel {
+
+    template<class TI, class TO>
+    struct MatMulCPUMetaData {
+        size_t M, K, N,
+            strideA0, strideA1,
+            strideB0, strideB1;
+        TI alpha;
+        TO beta;
+
+        /*
+         * 2D matrix multiplication: Y = a * A @ B + b * Y
+         * Assume bias C has been broadcast to Y already. Beta should be 0 in the absence of bias.
+         */
+        void matrixMultiply(TI const *a, TI const *b, TO *y) const noexcept {
+            // #pragma omp parallel for
+            for (size_t i = 0; i < M; i++) {
+                for (size_t j = 0; j < N; j++) {
+                    TO sum = 0;
+                    // #pragma omp simd reduction(+ : sum)
+                    for (size_t k = 0; k < K; k++) {
+                        sum += static_cast<TO>(a[i * strideA0 + k * strideA1] * b[k * strideB0 + j * strideB1]);
+                    }
+                    y[i * N + j] = beta * y[i * N + j] + alpha * sum;
+                }
+            }
+        }
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_MATMUL_COMMON_CPU_TEMPLATE_HPP
diff --git a/src/04kernel/src/kernels/mat_mul_integer/cpu_kernel.cc b/src/04kernel/src/kernels/mat_mul_integer/cpu_kernel.cc
new file mode 100644
index 000000000..14040f3fc
--- /dev/null
+++ b/src/04kernel/src/kernels/mat_mul_integer/cpu_kernel.cc
@@ -0,0 +1,151 @@
+#include "cpu_kernel.hh"
+#include "../mat_mul_common/cpu_template.hpp"
+#include <execution>
+
+namespace refactor::kernel {
+    using K = MatMulIntegerCpu;
+    using DT = DataType;
+
+    K::MatMulIntegerCpu(decltype(info) info_) noexcept
+        : Kernel(), info(std::move(info_)) {}
+
+    auto K::build(decltype(info) info) noexcept -> KernelBox {
+        return std::make_unique<K>(std::move(info));
+    }
+
+    auto K::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
+    auto K::description() const noexcept -> std::string_view {
+        return "Performing MatMulInteger using CPU";
+    }
+
+    template<class T> static int8_t sub(T, T);
+    template<> int8_t sub(int8_t a, int8_t b) { return a - b; }
+    template<> int8_t sub(uint8_t a, uint8_t b) { return static_cast<int8_t>(static_cast<int16_t>(a) - static_cast<int16_t>(b)); }
+
+    template<class T>
+    static void applyZeroPointScalar(
+        size_t size, int8_t *dst, void const *src_, void const *zp_) {
+
+        auto src = reinterpret_cast<T const *>(src_);
+        auto zp = *reinterpret_cast<T const *>(zp_);
+        std::transform(std::execution::par_unseq,
+                       src, src + size,
+                       dst, [zp](auto x) { return sub(x, zp); });
+    }
+    template<class T>
+    static void applyZeroPointA(
+        dim_t b, dim_t m, dim_t n,
+        int8_t *dst, void const *src_, void const *zp_) {
+
+        auto src = reinterpret_cast<T const *>(src_),
+             zp = reinterpret_cast<T const *>(zp_);
+        for (auto i : range0_(b))
+            for (auto j : range0_(m))
+                for (auto k : range0_(n))
+                    dst[i * m * n + j * n + k] = sub(src[i * m * n + j * n + k], zp[i * m + j]);
+    }
+    template<class T>
+    static void applyZeroPointB(
+        dim_t b, dim_t m, dim_t n,
+        int8_t *dst, void const *src_, void const *zp_) {
+
+        auto src = reinterpret_cast<T const *>(src_),
+             zp = reinterpret_cast<T const *>(zp_);
+        for (auto i : range0_(b))
+            for (auto j : range0_(m))
+                for (auto k : range0_(n))
+                    dst[i * m * n + j * n + k] = sub(src[i * m * n + j * n + k], zp[i * n + k]);
+    }
+
+    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
+        using namespace runtime;
+
+        size_t workspace = 0;
+        if (info.a.withZeroPoint) {
+            workspace += info.batch() * info.m * info.k;
+        }
+        if (info.b.withZeroPoint) {
+            workspace += info.batch() * info.k * info.n;
+        }
+
+        auto routine = [info = info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+            auto workspacePtr = reinterpret_cast<int8_t *>(workspace);
+            auto a = reinterpret_cast<int8_t const *>(inputs[0]),
+                 b = reinterpret_cast<int8_t const *>(inputs[1]);
+            auto y = reinterpret_cast<int32_t *>(outputs[0]);
+
+            if (auto meta = info.a; meta.withZeroPoint) {
+                auto size = info.batch() * info.m * info.k;
+                auto zp = inputs[2];
+                if (meta.scalar) {
+                    if (meta.signed_) {
+                        applyZeroPointScalar<int8_t>(size, workspacePtr, a, zp);
+                    } else {
+                        applyZeroPointScalar<uint8_t>(size, workspacePtr, a, zp);
+                    }
+                } else {
+                    if (meta.signed_) {
+                        applyZeroPointA<int8_t>(info.batch(), info.m, info.k, workspacePtr, a, zp);
+                    } else {
+                        applyZeroPointA<uint8_t>(info.batch(), info.m, info.k, workspacePtr, a, zp);
+                    }
+                }
+                a = workspacePtr;
+                workspacePtr += size;
+            }
+            if (auto meta = info.b; meta.withZeroPoint) {
+                auto size = info.batch() * info.k * info.n;
+                auto zp = inputs[3];
+                if (meta.scalar) {
+                    if (meta.signed_) {
+                        applyZeroPointScalar<int8_t>(size, workspacePtr, b, zp);
+                    } else {
+                        applyZeroPointScalar<uint8_t>(size, workspacePtr, b, zp);
+                    }
+                } else {
+                    if (meta.signed_) {
+                        applyZeroPointB<int8_t>(info.batch(), info.k, info.n, workspacePtr, b, zp);
+                    } else {
+                        applyZeroPointB<uint8_t>(info.batch(), info.k, info.n, workspacePtr, b, zp);
+                    }
+                }
+                b = workspacePtr;
+            }
+
+            MatMulCPUMetaData<int8_t, int32_t> const md{
+                .M = info.m,
+                .K = info.k,
+                .N = info.n,
+                .strideA0 = info.k,
+                .strideA1 = 1,
+                .strideB0 = info.n,
+                .strideB1 = 1,
+                .alpha = 1,
+                .beta = 0,
+            };
+            auto const stepY = info.m * info.n,
+                       stepA = info.m * info.k,
+                       stepB = info.k * info.n;
+
+            if (info.broadcaster.needBroadcast()) {
+                dim_t offset[2];
+                for (auto i : range0_(info.batch())) {
+                    info.broadcaster.locate(i, offset);
+                    md.matrixMultiply(a + stepA * offset[0], b + stepB * offset[1], y + stepY * i);
+                }
+            } else {
+                for (auto i : range0_(info.batch())) {
+                    md.matrixMultiply(a + stepA * i, b + stepB * i, y + stepY * i);
+                }
+            }
+        };
+
+        return {std::move(routine), workspace};
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/mat_mul_integer/cpu_kernel.hh b/src/04kernel/src/kernels/mat_mul_integer/cpu_kernel.hh
new file mode 100644
index 000000000..53e6f501c
--- /dev/null
+++ b/src/04kernel/src/kernels/mat_mul_integer/cpu_kernel.hh
@@ -0,0 +1,25 @@
+#ifndef KERNEL_MATMUL_INTEGER_CPU_KERNEL_HH
+#define KERNEL_MATMUL_INTEGER_CPU_KERNEL_HH
+
+#include "kernel/attributes/mat_mul_integer_info.h"
+#include "kernel/kernel.h"
+
+namespace refactor::kernel {
+
+    struct MatMulIntegerCpu final : public Kernel {
+        MatMulIntegerInfo info;
+
+        explicit MatMulIntegerCpu(decltype(info)) noexcept;
+
+        static KernelBox build(decltype(info)) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+
+        RoutineWorkspace lower(Resources &) const noexcept final;
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_MATMUL_INTEGER_CPU_KERNEL_HH
diff --git a/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cc b/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cc
new file mode 100644
index 000000000..d1eeb607e
--- /dev/null
+++ b/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cc
@@ -0,0 +1,28 @@
+#include "cublas_kernel.hh"
+
+namespace refactor::kernel {
+    using K = MatMulIntegerCublas;
+    using DT = DataType;
+
+    K::MatMulIntegerCublas(decltype(info) info_) noexcept
+        : Kernel(), info(std::move(info_)) {}
+
+    auto K::build(decltype(info) info) noexcept -> KernelBox {
+#ifndef USE_CUDA
+        return nullptr;
+#endif
+
+        return std::make_unique<K>(std::move(info));
+    }
+
+    auto K::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
+    auto K::description() const noexcept -> std::string_view {
+        return "Performing MatMulInteger using CUBLAS";
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cu b/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cu
new file mode 100644
index 000000000..ccff7001c
--- /dev/null
+++ b/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cu
@@ -0,0 +1,186 @@
+#include "../../utilities/cuda/cublas_context.hh"
+#include "cublas_kernel.hh"
+#include <thrust/execution_policy.h>
+#include <thrust/tabulate.h>
+#include <thrust/transform.h>
+
+namespace refactor::kernel {
+    using namespace runtime;
+    using namespace cublas;
+
+    template<class T> __device__ __forceinline__ static int8_t sub(T, T);
+    template<> __device__ __forceinline__ int8_t sub(int8_t a, int8_t b) { return a - b; }
+    template<> __device__ __forceinline__ int8_t sub(uint8_t a, uint8_t b) {
+        constexpr static int16_t MAX = 127;
+        return static_cast<int8_t>(CUB_MIN(MAX, static_cast<int16_t>(a) - static_cast<int16_t>(b)));
+    }
+
+    template<class T>
+    struct MatMulIntegerZPFunctorScalar {
+        T const *zp;
+
+        __device__ int8_t operator()(T x) const noexcept {
+            return sub(x, *zp);
+        }
+    };
+
+    template<class T>
+    static void applyZeroPointScalar(
+        size_t size, int8_t *dst, void const *src_, void const *zp_) {
+
+        auto src = reinterpret_cast<T const *>(src_),
+             zp = reinterpret_cast<T const *>(zp_);
+        thrust::transform(thrust::device,
+                          src, src + size,
+                          dst, MatMulIntegerZPFunctorScalar<T>{zp});
+    }
+
+    template<class T>
+    struct MatMulIntegerZPFunctor {
+        dim_t m, n, a, b, c;
+        T const *src, *zp;
+
+        __device__ int8_t operator()(size_t idx) const noexcept {
+            auto
+                k = idx % n,
+                j = idx / n % m,
+                i = idx / n / m;
+            return sub(src[idx], zp[i * a + j * b + k * c]);
+        }
+    };
+
+    template<class T>
+    static void applyZeroPointA(
+        dim_t b, dim_t m, dim_t n,
+        int8_t *dst, void const *src_, void const *zp_) {
+        thrust::tabulate(thrust::device,
+                         dst, dst + b * m * n,
+                         MatMulIntegerZPFunctor<T>{
+                             m,
+                             n,
+                             m,
+                             1,
+                             0,
+                             reinterpret_cast<T const *>(src_),
+                             reinterpret_cast<T const *>(zp_),
+                         });
+    }
+
+    template<class T>
+    static void applyZeroPointB(
+        dim_t b, dim_t m, dim_t n,
+        int8_t *dst, void const *src_, void const *zp_) {
+
+        thrust::tabulate(thrust::device,
+                         dst, dst + b * m * n,
+                         MatMulIntegerZPFunctor<T>{
+                             m,
+                             n,
+                             n,
+                             0,
+                             1,
+                             reinterpret_cast<T const *>(src_),
+                             reinterpret_cast<T const *>(zp_),
+                         });
+    }
+
+    auto MatMulIntegerCublas::lower(Resources &res) const noexcept -> RoutineWorkspace {
+
+        size_t workspace = 0;
+        if (info.a.withZeroPoint) {
+            workspace += info.batch() * info.m * info.k;
+        }
+        if (info.b.withZeroPoint) {
+            workspace += info.batch() * info.k * info.n;
+        }
+
+        auto routine = [info = info](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
+            auto workspacePtr = reinterpret_cast<int8_t *>(workspace);
+            auto a = reinterpret_cast<int8_t const *>(inputs[0]),
+                 b = reinterpret_cast<int8_t const *>(inputs[1]);
+            auto y = reinterpret_cast<int32_t *>(outputs[0]);
+
+            if (auto meta = info.a; meta.withZeroPoint) {
+                auto size = info.batch() * info.m * info.k;
+                auto zp = inputs[2];
+                if (meta.scalar) {
+                    if (meta.signed_) {
+                        applyZeroPointScalar<int8_t>(size, workspacePtr, a, zp);
+                    } else {
+                        applyZeroPointScalar<uint8_t>(size, workspacePtr, a, zp);
+                    }
+                } else {
+                    if (meta.signed_) {
+                        applyZeroPointA<int8_t>(info.batch(), info.m, info.k, workspacePtr, a, zp);
+                    } else {
+                        applyZeroPointA<uint8_t>(info.batch(), info.m, info.k, workspacePtr, a, zp);
+                    }
+                }
+                a = workspacePtr;
+                workspacePtr += size;
+            }
+            if (auto meta = info.b; meta.withZeroPoint) {
+                auto size = info.batch() * info.k * info.n;
+                auto zp = inputs[3];
+                if (meta.scalar) {
+                    if (meta.signed_) {
+                        applyZeroPointScalar<int8_t>(size, workspacePtr, b, zp);
+                    } else {
+                        applyZeroPointScalar<uint8_t>(size, workspacePtr, b, zp);
+                    }
+                } else {
+                    if (meta.signed_) {
+                        applyZeroPointB<int8_t>(info.batch(), info.k, info.n, workspacePtr, b, zp);
+                    } else {
+                        applyZeroPointB<uint8_t>(info.batch(), info.k, info.n, workspacePtr, b, zp);
+                    }
+                }
+                b = workspacePtr;
+            }
+
+            auto handle = res.fetchOrStore<CublasContext>()->handle;
+            int32_t alpha = 1,
+                    beta = 0;
+            auto m = info.m,
+                 n = info.n,
+                 k = info.k;
+            auto strideY = m * n,
+                 strideA = m * k,
+                 strideB = k * n;
+            auto lda = k,
+                 ldb = n;
+            if (info.broadcaster.needBroadcast()) {
+
+                uint32_t offset[2];
+                for (auto i : range0_(info.batch())) {
+                    info.broadcaster.locate(i, offset);
+                    CUBLAS_ASSERT(cublasGemmEx(
+                        handle,
+                        CUBLAS_OP_N, CUBLAS_OP_N,
+                        n, m, k,
+                        &alpha,
+                        b + strideB * offset[1], CUDA_R_8I, ldb,
+                        a + strideA * offset[0], CUDA_R_8I, lda,
+                        &beta, y + strideY * i, CUDA_R_32I, n,
+                        CUDA_R_32I, CUBLAS_GEMM_DEFAULT));
+                }
+            } else {
+
+                CUBLAS_ASSERT(cublasGemmStridedBatchedEx(
+                    handle,
+                    CUBLAS_OP_N, CUBLAS_OP_N,
+                    n, m, k,
+                    &alpha,
+                    b, CUDA_R_8I, ldb, strideB,
+                    a, CUDA_R_8I, lda, strideA,
+                    &beta, y, CUDA_R_32I, n,
+                    strideY, info.batch(),
+                    CUDA_R_32I, CUBLAS_GEMM_DEFAULT));
+            }
+        };
+
+        res.fetchOrStore<CublasContext>();
+        return {std::move(routine), workspace};
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.hh b/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.hh
new file mode 100644
index 000000000..d0d0400a7
--- /dev/null
+++ b/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.hh
@@ -0,0 +1,26 @@
+#ifndef KERNEL_MATMUL_INTEGER_CUBLAS_KERNEL_HH
+#define KERNEL_MATMUL_INTEGER_CUBLAS_KERNEL_HH
+
+#include "kernel/attributes/mat_mul_integer_info.h"
+#include "kernel/kernel.h"
+
+namespace refactor::kernel {
+
+    struct MatMulIntegerCublas final : public Kernel {
+        MatMulIntegerInfo info;
+
+        explicit MatMulIntegerCublas(decltype(info)) noexcept;
+
+        static KernelBox build(decltype(info)) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+#ifdef USE_CUDA
+        RoutineWorkspace lower(Resources &) const noexcept final;
+#endif
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_MATMUL_INTEGER_CUBLAS_KERNEL_HH
diff --git a/src/04kernel/src/kernels/simple_binary/cuda_kernel.cc b/src/04kernel/src/kernels/simple_binary/cuda_kernel.cc
index 3c2351c29..58d5f677e 100644
--- a/src/04kernel/src/kernels/simple_binary/cuda_kernel.cc
+++ b/src/04kernel/src/kernels/simple_binary/cuda_kernel.cc
@@ -154,7 +154,7 @@ extern "C" __global__ void kernel(
         auto op_ = op(opType, dataType);
         auto params = cuda::ThreadsDistributer()(broadcaster.outputsCount);
 
-        if (broadcaster.strides.empty()) {
+        if (!broadcaster.needBroadcast()) {
             auto name = fmt::format("binary{}", postfix);
             auto code = fmt::format(NO_BROADCAST, dt_, op_);
             return [params, h = nvrtc::Handler::compile(name.c_str(), code.c_str(), "kernel")]//
diff --git a/src/04kernel/src/kernels/simple_unary/cudnn_activation_kernel.cc b/src/04kernel/src/kernels/simple_unary/cudnn_activation_kernel.cc
index 92662a61a..ec3ad5698 100644
--- a/src/04kernel/src/kernels/simple_unary/cudnn_activation_kernel.cc
+++ b/src/04kernel/src/kernels/simple_unary/cudnn_activation_kernel.cc
@@ -64,28 
+64,25 @@ namespace refactor::kernel { auto d = std::make_shared(); // clang-format off - cudnnActivationMode_t - mode = type == Ty::Relu ? CUDNN_ACTIVATION_RELU - : type == Ty::Sigmoid ? CUDNN_ACTIVATION_SIGMOID - : type == Ty::Tanh ? CUDNN_ACTIVATION_TANH - : UNREACHABLEX(cudnnActivationMode_t, ""); + auto mode = type == Ty::Relu ? CUDNN_ACTIVATION_RELU + : type == Ty::Sigmoid ? CUDNN_ACTIVATION_SIGMOID + : type == Ty::Tanh ? CUDNN_ACTIVATION_TANH + : UNREACHABLEX(cudnnActivationMode_t, ""); // clang-format on + setCudnnTensor(d->tensor, dataType, slice(&size, 1)); CUDNN_ASSERT(cudnnSetActivationDescriptor(d->activation, mode, CUDNN_PROPAGATE_NAN, 0.0)); - CUDNN_ASSERT(cudnnSetTensor4dDescriptor(d->tensor, CUDNN_TENSOR_NCHW, cudnnDataTypeConvert(dataType), 1, 1, 1, size)); res.fetchOrStore(); - // nvcc at c++11 doesn't support real move capture - return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - // fetch cudnn handle from resources - auto handle = res.fetchOrStore()->handle; - // name inputs and outputs - auto x = inputs[0]; - auto y = outputs[0]; - // call cudnn activation - float alpha = 1, beta = 0; - CUDNN_ASSERT(cudnnActivationForward(handle, d->activation, &alpha, d->tensor, x, &beta, d->tensor, y)); - }; + return [d = std::move(d)]// + (Resources & res, void *, void const *const *inputs, void *const *outputs) { + float alpha = 1, beta = 0; + CUDNN_ASSERT(cudnnActivationForward( + res.fetchOrStore()->handle, + d->activation, + &alpha, d->tensor, inputs[0], + &beta, d->tensor, outputs[0])); + }; } #endif diff --git a/src/04kernel/src/kernels/softmax/cuda_kernel.cu b/src/04kernel/src/kernels/softmax/cuda_kernel.cu index 7c65ea029..114cb453a 100644 --- a/src/04kernel/src/kernels/softmax/cuda_kernel.cu +++ b/src/04kernel/src/kernels/softmax/cuda_kernel.cu @@ -4,9 +4,6 @@ namespace refactor::kernel { using namespace runtime; - template - __device__ __forceinline__ T max_(T a, T b) { return a > b ? 
a : b; } - template __device__ __forceinline__ T exp_(T x); template<> __device__ __forceinline__ float exp_(float x) { return expf(x); } @@ -58,16 +55,6 @@ namespace refactor::kernel { } } - template struct SumOp { - __device__ __forceinline__ T operator()(T const &a, T const &b) const { - return a + b; - } - }; - template struct MaxOp { - __device__ __forceinline__ T operator()(T const &a, T const &b) const { - return max_(a, b); - } - }; template __device__ __forceinline__ T WarpAllReduce(T val, ReductionOp op) { for (int mask = blockDim.x >> 1; mask > 0; mask >>= 1) { @@ -92,9 +79,9 @@ namespace refactor::kernel { T maxData = -__FLT_MAX__; for (int i = threadIdx.x; i < dimsize; i += blockDim.x) { - maxData = max_(maxData, input[tid + i * stride]); + maxData = CUB_MAX(maxData, input[tid + i * stride]); } - maxData = WarpAllReduce(maxData, MaxOp{}); + maxData = WarpAllReduce(maxData, cub::Max()); if (threadIdx.x == 0) { maxTotal[threadIdx.y] = maxData; } @@ -104,7 +91,7 @@ namespace refactor::kernel { for (int i = threadIdx.x; i < dimsize; i += blockDim.x) { sumData += exp_(input[tid + i * stride] - maxTotal[threadIdx.y]); } - sumData = WarpAllReduce(sumData, SumOp{}); + sumData = WarpAllReduce(sumData, cub::Sum()); if (threadIdx.x == 0) { sumTotal[threadIdx.y] = sumData; } diff --git a/src/04kernel/src/kernels/softmax/cudnn_kernel.cc b/src/04kernel/src/kernels/softmax/cudnn_kernel.cc index 0536073a2..cff6d26bb 100644 --- a/src/04kernel/src/kernels/softmax/cudnn_kernel.cc +++ b/src/04kernel/src/kernels/softmax/cudnn_kernel.cc @@ -58,11 +58,8 @@ namespace refactor::kernel { auto d = std::make_shared( static_cast(algo), dataType != DataType::F64); - CUDNN_ASSERT(cudnnSetTensor4dDescriptor( - d->t, - CUDNN_TENSOR_NCHW, - cudnnDataTypeConvert(dataType), - pre, mid, post, 1)); + int dims[]{pre, mid, post, 1}; + setCudnnTensor(d->t, dataType, slice(dims, 4)); res.fetchOrStore(); return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { diff --git a/src/04kernel/src/utilities/cuda/cublas_context.hh b/src/04kernel/src/utilities/cuda/cublas_context.hh index 1013fe972..69f56cd4a 100644 --- a/src/04kernel/src/utilities/cuda/cublas_context.hh +++ b/src/04kernel/src/utilities/cuda/cublas_context.hh @@ -4,6 +4,13 @@ #include "runtime/resource.h" #include +#define CUBLAS_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CUBLAS_STATUS_SUCCESS) { \ + fmt::println("cublas failed on \"" #STATUS "\" with {}", \ + (int) status); \ + abort(); \ + } + namespace refactor::kernel::cublas { struct CublasContext final : public runtime::Resource { diff --git a/src/04kernel/src/utilities/cuda/cudnn_functions.cpp b/src/04kernel/src/utilities/cuda/cudnn_functions.cpp new file mode 100644 index 000000000..1beeaaade --- /dev/null +++ b/src/04kernel/src/utilities/cuda/cudnn_functions.cpp @@ -0,0 +1,38 @@ +#ifdef USE_CUDA + +#include "cudnn_functions.h" + +namespace refactor::kernel::cudnn { + + cudnnDataType_t cudnnDataTypeConvert(DataType dataType) { + // clang-format off + switch (dataType) { + case DataType::F32 : return CUDNN_DATA_FLOAT; break; + case DataType::F64 : return CUDNN_DATA_DOUBLE; break; + case DataType::FP16: return CUDNN_DATA_HALF; break; + case DataType::I8 : return CUDNN_DATA_INT8; break; + case DataType::I32 : return CUDNN_DATA_INT32; break; + case DataType::U8 : return CUDNN_DATA_UINT8; break; + case DataType::BF16: return CUDNN_DATA_BFLOAT16; break; + case DataType::I64 : return CUDNN_DATA_INT64; break; + case DataType::Bool: return 
CUDNN_DATA_BOOLEAN; break; + default: UNREACHABLE(); + } + // clang-format on + } + + void setCudnnTensor(cudnnTensorDescriptor_t t, DataType dt, slice_t d) { + auto dt_ = cudnnDataTypeConvert(dt); + if (auto n = d.size(); n == 4) { + CUDNN_ASSERT(cudnnSetTensor4dDescriptor(t, CUDNN_TENSOR_NCHW, dt_, d[0], d[1], d[2], d[3])); + } else if (n < 4) { + int d_[]{1, 1, 1, 1}; + std::copy_n(d.begin(), n, d_ + 4 - n); + CUDNN_ASSERT(cudnnSetTensor4dDescriptor(t, CUDNN_TENSOR_NCHW, dt_, d_[0], d_[1], d_[2], d_[3])); + } else { + CUDNN_ASSERT(cudnnSetTensorNdDescriptorEx(t, CUDNN_TENSOR_NCHW, dt_, d.size(), d.begin())); + } + } +}// namespace refactor::kernel::cudnn + +#endif diff --git a/src/04kernel/src/utilities/cuda/cudnn_functions.cu b/src/04kernel/src/utilities/cuda/cudnn_functions.cu deleted file mode 100644 index 62781f950..000000000 --- a/src/04kernel/src/utilities/cuda/cudnn_functions.cu +++ /dev/null @@ -1,35 +0,0 @@ -#include "cudnn_functions.h" - -namespace refactor::kernel::cudnn { - using DT = DataType; - - cudnnDataType_t cudnnDataTypeConvert(DT dataType) { - switch (dataType) { - // clang-format off - case DT::F32 : return CUDNN_DATA_FLOAT; break; - case DT::F64 : return CUDNN_DATA_DOUBLE; break; - case DT::FP16: return CUDNN_DATA_HALF; break; - case DT::I8 : return CUDNN_DATA_INT8; break; - case DT::I32 : return CUDNN_DATA_INT32; break; - case DT::U8 : return CUDNN_DATA_UINT8; break; - case DT::BF16: return CUDNN_DATA_BFLOAT16; break; - case DT::I64 : return CUDNN_DATA_INT64; break; - case DT::Bool: return CUDNN_DATA_BOOLEAN; break; - default: UNREACHABLE(); - // clang-format on - } - } - - void setCudnnTensor(cudnnTensorDescriptor_t t, DT dt, slice_t d) { - auto dt_ = cudnnDataTypeConvert(dt); - if (auto n = d.size(); n == 4) { - CUDNN_ASSERT(cudnnSetTensor4dDescriptor(t, CUDNN_TENSOR_NCHW, dt_, d[0], d[1], d[2], d[3])); - } else if (n < 4) { - int d_[]{1, 1, 1, 1}; - std::copy_n(d.begin(), n, d_ + 4 - n); - CUDNN_ASSERT(cudnnSetTensor4dDescriptor(t, CUDNN_TENSOR_NCHW, dt_, d_[0], d_[1], d_[2], d_[3])); - } else { - CUDNN_ASSERT(cudnnSetTensorNdDescriptorEx(t, CUDNN_TENSOR_NCHW, dt_, d.size(), d.begin())); - } - } -}// namespace refactor::kernel::cudnn diff --git a/src/04kernel/test/kernels/dequantize_linear/test_cpu.cpp b/src/04kernel/test/kernels/dequantize_linear/test_cpu.cpp new file mode 100644 index 000000000..831dda424 --- /dev/null +++ b/src/04kernel/test/kernels/dequantize_linear/test_cpu.cpp @@ -0,0 +1,31 @@ +#include "../../../src/kernels/dequantize_linear/cpu_kernel.hh" +#include +#include + +using namespace refactor; +using namespace kernel; + +TEST(kernel, DequantizeLinearCpu) { + // build routine + auto x = Tensor::share(DataType::U8, {4}); + auto scale = Tensor::share(DataType::F32, {}); + auto zeroPoint = Tensor::share(DataType::U8, {}); + auto y = Tensor::share(DataType::F32, {4}); + auto kernel = DequantizeLinearCpu::build({*x, *scale, *zeroPoint}, *y); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector xData{0, 3, 128, 255}; + float scale_ = 2; + uint8_t zp_ = 128; + std::vector yData(xData.size()); + // inference + { + void const *inputs[]{xData.data(), &scale_, &zp_}; + void *outputs[]{yData.data()}; + routine(res, nullptr, inputs, outputs); + } + // check + ASSERT_EQ(yData, (decltype(yData){-256, -250, 0, 254})); +} diff --git a/src/04kernel/test/kernels/dequantize_linear/test_cuda.cpp b/src/04kernel/test/kernels/dequantize_linear/test_cuda.cpp new file mode 100644 index 
000000000..87ec6fd95 --- /dev/null +++ b/src/04kernel/test/kernels/dequantize_linear/test_cuda.cpp @@ -0,0 +1,57 @@ +#ifdef USE_CUDA + +#include "../../../src/kernels/dequantize_linear/cpu_kernel.hh" +#include "../../../src/kernels/dequantize_linear/cuda_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, DequantizeLinearCuda) { + // build routine + auto x = Tensor::share(DataType::U8, {4}); + auto scale = Tensor::share(DataType::F32, {}); + auto zeroPoint = Tensor::share(DataType::U8, {}); + auto y = Tensor::share(DataType::F32, {4}); + auto kernel = DequantizeLinearCuda::build({*x, *scale, *zeroPoint}, *y), + kCpu = DequantizeLinearCpu::build({*x, *scale, *zeroPoint}, *y); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + auto rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Nvidia, 0, ""); + auto xGpu = dev.malloc(x->bytesSize()), + scaleGpu = dev.malloc(sizeof(float)), + zpGpu = dev.malloc(sizeof(uint8_t)), + yGpu = dev.malloc(y->bytesSize()); + // put input data + std::vector xData{0, 3, 128, 255}; + float scale_ = 2; + uint8_t zp_ = 128; + std::vector yData(xData.size()); + xGpu->copyFromHost(xData.data()); + scaleGpu->copyFromHost(&scale_); + zpGpu->copyFromHost(&zp_); + // inference + { + void const *inputs[]{*xGpu, *scaleGpu, *zpGpu}; + void *outputs[]{*yGpu}; + routine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{xData.data(), &scale_, &zp_}; + void *outputs[]{yData.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + { + std::vector result(yData.size()); + yGpu->copyToHost(result.data()); + EXPECT_EQ(result, yData); + } +} + +#endif diff --git a/src/04kernel/test/kernels/dynamic_quantize_linear/test_cpu.cpp b/src/04kernel/test/kernels/dynamic_quantize_linear/test_cpu.cpp new file mode 100644 index 000000000..4101e1895 --- /dev/null +++ b/src/04kernel/test/kernels/dynamic_quantize_linear/test_cpu.cpp @@ -0,0 +1,31 @@ +#include "../../../src/kernels/dynamic_quantize_linear/cpu_kernel.hh" +#include +#include + +using namespace refactor; +using namespace kernel; + +TEST(kernel, DynamicQuantizeLinearCpu) { + // build routine + auto kernel = DynamicQuantizeLinearCpu::build(6); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector x{0, 2, -3, -2.5, 1.34, 0.5}; + std::vector y(x.size()); + float scale; + uint8_t zeroPoint; + // inference + { + void const *inputs[]{x.data()}; + void *outputs[]{y.data(), &scale, &zeroPoint}; + routine(res, nullptr, inputs, outputs); + } + // check + EXPECT_FLOAT_EQ(scale, (2 + 3) / 255.f); + EXPECT_EQ(zeroPoint, 153); + for (auto i : range0_(y.size())) { + EXPECT_EQ(y[i], static_cast(std::round(x[i] / scale) + zeroPoint)); + } +} diff --git a/src/04kernel/test/kernels/dynamic_quantize_linear/test_cuda.cpp b/src/04kernel/test/kernels/dynamic_quantize_linear/test_cuda.cpp new file mode 100644 index 000000000..2d99f6476 --- /dev/null +++ b/src/04kernel/test/kernels/dynamic_quantize_linear/test_cuda.cpp @@ -0,0 +1,62 @@ +#ifdef USE_CUDA + +#include "../../../src/kernels/dynamic_quantize_linear/cpu_kernel.hh" +#include "../../../src/kernels/dynamic_quantize_linear/cuda_kernel.hh" +#include "hardware/device_manager.h" +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, 
DynamicQuantizeLinearCuda) { + auto size = 20; + // build routine + auto kernel = DynamicQuantizeLinearCuda::build(size), + kCpu = DynamicQuantizeLinearCpu::build(size); + ASSERT_TRUE(kernel && kCpu); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + auto rCpu = kCpu->lower(res).routine; + // malloc + auto &dev = *device::init(Device::Type::Nvidia, 0, ""); + auto xGpu = dev.malloc(size * sizeof(float)), + yGpu = dev.malloc(size * sizeof(uint8_t)), + scaleGpu = dev.malloc(sizeof(float)), + zpGpu = dev.malloc(sizeof(uint8_t)), + workspace = dev.malloc(workspaceSize); + // put input data + std::vector x(size); + std::vector y(size); + float scale; + uint8_t zeroPoint; + for (auto i : range0_(size)) { + x[i] = i * 3 + 15; + } + xGpu->copyFromHost(x.data()); + // inference + { + void const *inputs[]{*xGpu}; + void *outputs[]{*yGpu, *scaleGpu, *zpGpu}; + routine(res, *workspace, inputs, outputs); + } + { + void const *inputs[]{x.data()}; + void *outputs[]{y.data(), &scale, &zeroPoint}; + rCpu(res, nullptr, inputs, outputs); + } + // check + { + std::vector result(size); + yGpu->copyToHost(result.data()); + EXPECT_EQ(result, y); + float scale_; + scaleGpu->copyToHost(&scale_); + EXPECT_EQ(scale_, scale); + uint8_t zp_; + zpGpu->copyToHost(&zp_); + EXPECT_EQ(zp_, zeroPoint); + } +} + +#endif diff --git a/src/04kernel/test/kernels/mat_mul/test_cpu.cpp b/src/04kernel/test/kernels/mat_mul/test_cpu.cpp new file mode 100644 index 000000000..f6e8f3147 --- /dev/null +++ b/src/04kernel/test/kernels/mat_mul/test_cpu.cpp @@ -0,0 +1,74 @@ +#include "../src/kernels/mat_mul/cpu_kernel.hh" +#include + +using namespace refactor; +using namespace kernel; + +template +static void check( + Resources &&res, + Routine &&routine, + std::vector ans, + std::vector const &a, + std::vector const &b, + std::vector const &c) { + std::vector result(ans.size()); + // inference + void const *inputs[]{a.data(), b.data(), c.data()}; + void *outputs[]{result.data()}; + routine(res, nullptr, inputs, outputs); + // check + EXPECT_EQ(result, ans); +} + +TEST(kernel, MatMulCPU_WithBias) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{1, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{2, 2}); + auto C = Tensor::share(DataType::F32, Shape{}); + auto Y = Tensor::share(DataType::F32, Shape{2, 2}); + auto kernel = MatMulCPU::build(MatMulInfo(*A, *B, *C, false, false, 1, 1)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + check(std::move(res), kernel->lower(res).routine, + {2, 4, 1, 1.25}, + {1.0, 2.0, 0.0, 0.5}, + {1.0, 2.0, 0.0, 0.5}, + {1.0}); +} + +TEST(kernel, MatMulCPU_UINT16NoBias) { + // build routine + auto A = Tensor::share(DataType::U16, Shape{2, 2}); + auto B = Tensor::share(DataType::U16, Shape{2, 2}); + auto Y = Tensor::share(DataType::U16, Shape{2, 2}); + auto kernel = MatMulCPU::build(MatMulInfo(*A, *B, std::nullopt, false, false, 1, 1)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + check(std::move(res), kernel->lower(res).routine, + {7, 6, 2, 3}, + {3, 2, 0, 1}, + {1, 0, 2, 3}, + {}); +} + +TEST(kernel, MatMulCPU_Broadcast) { + // build routine + auto A = Tensor::share(DataType::F32, Shape{2, 1, 2, 2}); + auto B = Tensor::share(DataType::F32, Shape{1, 2, 2, 2}); + auto C = Tensor::share(DataType::F32, Shape{2, 1}); + auto Y = Tensor::share(DataType::F32, Shape{2, 2, 2, 2}); + auto kernel = MatMulCPU::build(MatMulInfo(*A, *B, *C, false, false, 1, 1)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + check(std::move(res), 
kernel->lower(res).routine, + {2.0, 4.0, 0.0, 0.25, + 2.0, 3.0, 0.0, 0.5, + 2.0, 3.0, 0.0, 0.5, + 2.0, 1.0, 0.0, 1.0}, + {1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0}, + {1.0, 2.0, 0.0, 0.5, + 1.0, 0.0, 0.0, 1.0}, + {1.0, 0.0}); +} diff --git a/src/04kernel/test/kernels/matmul/test_matmul_cublas.cpp b/src/04kernel/test/kernels/mat_mul/test_cublas.cpp similarity index 96% rename from src/04kernel/test/kernels/matmul/test_matmul_cublas.cpp rename to src/04kernel/test/kernels/mat_mul/test_cublas.cpp index 7f96d5f27..ef1328f20 100644 --- a/src/04kernel/test/kernels/matmul/test_matmul_cublas.cpp +++ b/src/04kernel/test/kernels/mat_mul/test_cublas.cpp @@ -92,9 +92,7 @@ TEST(kernel, MatMulCublas_Broadcast) { std::vector result(Y->elementsSize()); my->copyToHost(result.data(), Y->bytesSize()); // check - for (auto i : range0_(result.size())) { - EXPECT_FLOAT_EQ(result[i], cpuOut[i]); - } + EXPECT_EQ(result, cpuOut); } TEST(kernel, MatMulCublas_TransABNoBias) { @@ -137,9 +135,7 @@ TEST(kernel, MatMulCublas_TransABNoBias) { std::vector result(Y->elementsSize()); my->copyToHost(result.data(), Y->bytesSize()); // check - for (auto i : range0_(result.size())) { - EXPECT_FLOAT_EQ(result[i], cpuOut[i]); - } + EXPECT_EQ(result, cpuOut); } TEST(kernel, MatMulCublas_Large) { @@ -192,9 +188,7 @@ TEST(kernel, MatMulCublas_Large) { std::vector result(Y->elementsSize()); my->copyToHost(result.data(), Y->bytesSize()); // check - for (auto i : range0_(result.size())) { - EXPECT_FLOAT_EQ(result[i], cpuOut[i]); - } + EXPECT_EQ(result, cpuOut); } #endif diff --git a/src/04kernel/test/kernels/mat_mul_integer/test_cpu_kernel.cpp b/src/04kernel/test/kernels/mat_mul_integer/test_cpu_kernel.cpp new file mode 100644 index 000000000..82cf8b4f1 --- /dev/null +++ b/src/04kernel/test/kernels/mat_mul_integer/test_cpu_kernel.cpp @@ -0,0 +1,32 @@ +#include "../src/kernels/mat_mul_integer/cpu_kernel.hh" +#include + +using namespace refactor; +using namespace kernel; + +TEST(kernel, MatMulIntegerCpu) { + // build routine + auto A = Tensor::share(DataType::U8, Shape{2, 3}); + auto B = Tensor::share(DataType::U8, Shape{3, 1}); + auto Y = Tensor::share(DataType::I32, Shape{2, 1}); + auto kernel = MatMulIntegerCpu::build(MatMulIntegerInfo(TensorRefs{*A, *B})); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto [routine, workspaceSize] = kernel->lower(res); + // put input data + std::vector + dataA{1, 2, 3, 4, 5, 6}, + dataB{1, 2, 3}, + workspace(workspaceSize); + std::vector + result(Y->elementsSize()), + ans{14, 32}; + // inference + { + void const *inputs[]{dataA.data(), dataB.data()}; + void *outputs[]{result.data()}; + routine(res, workspace.data(), inputs, outputs); + } + // check + EXPECT_EQ(result, ans); +} diff --git a/src/04kernel/test/kernels/mat_mul_integer/test_cublas_kernel.cpp b/src/04kernel/test/kernels/mat_mul_integer/test_cublas_kernel.cpp new file mode 100644 index 000000000..55ff188e4 --- /dev/null +++ b/src/04kernel/test/kernels/mat_mul_integer/test_cublas_kernel.cpp @@ -0,0 +1,61 @@ +#ifdef USE_CUDA + +#include "../src/kernels/mat_mul_integer/cpu_kernel.hh" +#include "../src/kernels/mat_mul_integer/cublas_kernel.hh" +#include "hardware/device_manager.h" +#include +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, MatMulIntegerCublas) { + // build routine + auto A = Tensor::share(DataType::U8, Shape{1, 4}); + auto B = Tensor::share(DataType::U8, Shape{4, 12}); + auto Y = Tensor::share(DataType::I32, Shape{1, 12}); + MatMulIntegerInfo 
info(TensorRefs{*A, *B}); + auto cpuKernel = MatMulIntegerCpu::build(info); + auto gpuKernel = MatMulIntegerCublas::build(info); + ASSERT_TRUE(cpuKernel && gpuKernel); + auto res = runtime::Resources(); + auto [cpuRoutine, workspace] = cpuKernel->lower(res); + auto [gpuRoutine, workspace_] = gpuKernel->lower(res); + ASSERT_EQ(workspace, workspace_); + // put input data + std::vector + dataA(A->elementsSize()), + dataB(B->elementsSize()); + std::vector + dataY(Y->elementsSize()), + result(Y->elementsSize()); + std::iota(dataA.begin(), dataA.end(), 1); + std::iota(dataB.data() + 0, dataB.data() + 12, 1); + std::iota(dataB.data() + 12, dataB.data() + 24, 1); + std::iota(dataB.data() + 24, dataB.data() + 36, 1); + std::iota(dataB.data() + 36, dataB.data() + 48, 1); + auto &dev = *device::init(Device::Type::Nvidia, 0, ""); + auto ma = dev.malloc(A->bytesSize()), + mb = dev.malloc(B->bytesSize()), + my = dev.malloc(Y->bytesSize()); + ma->copyFromHost(dataA.data(), A->bytesSize()); + mb->copyFromHost(dataB.data(), B->bytesSize()); + // inference + { + void const *inputs[]{*ma, *mb}; + void *outputs[]{*my}; + gpuRoutine(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{dataA.data(), dataB.data()}; + void *outputs[]{dataY.data()}; + cpuRoutine(res, nullptr, inputs, outputs); + } + // take output data + my->copyToHost(result.data(), Y->bytesSize()); + // check + EXPECT_EQ(result, dataY); +} + +#endif diff --git a/src/04kernel/test/kernels/matmul/test_matmul_cpu.cpp b/src/04kernel/test/kernels/matmul/test_matmul_cpu.cpp deleted file mode 100644 index 0228d1fb7..000000000 --- a/src/04kernel/test/kernels/matmul/test_matmul_cpu.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "../src/kernels/mat_mul/cpu_kernel.hh" -#include - -using namespace refactor; -using namespace kernel; - -TEST(kernel, MatMulCPU_WithBias) { - // build routine - auto A = Tensor::share(DataType::F32, Shape{1, 2, 2}); - auto B = Tensor::share(DataType::F32, Shape{2, 2}); - auto C = Tensor::share(DataType::F32, Shape{}); - auto Y = Tensor::share(DataType::F32, Shape{2, 2}); - auto kernel = MatMulCPU::build(MatMulInfo(*A, *B, *C, false, false, 1, 1)); - ASSERT_TRUE(kernel); - auto res = runtime::Resources(); - auto routine = kernel->lower(res).routine; - // put input data - std::vector - dataA{1.0, 2.0, 0.0, 0.5}, - dataB{1.0, 2.0, 0.0, 0.5}, - dataC{1.0}, - result(Y->elementsSize()), - ans{2, 4, 1, 1.25}; - // inference - { - void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; - void *outputs[]{result.data()}; - routine(res, nullptr, inputs, outputs); - } - // check - for (auto i : range0_(result.size())) { - EXPECT_FLOAT_EQ(result[i], ans[i]); - } -} - -TEST(kernel, MatMulCPU_UINT16NoBias) { - // build routine - auto A = Tensor::share(DataType::U16, Shape{2, 2}); - auto B = Tensor::share(DataType::U16, Shape{2, 2}); - auto Y = Tensor::share(DataType::U16, Shape{2, 2}); - auto kernel = MatMulCPU::build(MatMulInfo(*A, *B, std::nullopt, false, false, 1, 1)); - ASSERT_TRUE(kernel); - auto res = runtime::Resources(); - auto routine = kernel->lower(res).routine; - // put input data - std::vector - dataA{3, 2, 0, 1}, - dataB{1, 0, 2, 3}, - result(Y->elementsSize()), - ans{7, 6, 2, 3}; - // inference - { - void const *inputs[]{dataA.data(), dataB.data()}; - void *outputs[]{result.data()}; - routine(res, nullptr, inputs, outputs); - } - // check - for (auto i : range0_(result.size())) { - EXPECT_EQ(result[i], ans[i]); - } -} - -TEST(kernel, MatMulCPU_Broadcast) { - // build routine - auto A = Tensor::share(DataType::F32, 
Shape{2, 1, 2, 2}); - auto B = Tensor::share(DataType::F32, Shape{1, 2, 2, 2}); - auto C = Tensor::share(DataType::F32, Shape{2, 1}); - auto Y = Tensor::share(DataType::F32, Shape{2, 2, 2, 2}); - auto kernel = MatMulCPU::build(MatMulInfo(*A, *B, *C, false, false, 1, 1)); - ASSERT_TRUE(kernel); - auto res = runtime::Resources(); - auto routine = kernel->lower(res).routine; - // put input data - std::vector - dataA{1.0, 2.0, 0.0, 0.5, - 1.0, 0.0, 0.0, 1.0}, - dataB{1.0, 2.0, 0.0, 0.5, - 1.0, 0.0, 0.0, 1.0}, - dataC{1.0, 0.0}, - result(Y->elementsSize()), - ans{2.0, 4.0, 0.0, 0.25, - 2.0, 3.0, 0.0, 0.5, - 2.0, 3.0, 0.0, 0.5, - 2.0, 1.0, 0.0, 1.0}; - // inference - { - void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; - void *outputs[]{result.data()}; - routine(res, nullptr, inputs, outputs); - } - // check - for (auto i : range0_(result.size())) { - EXPECT_FLOAT_EQ(result[i], ans[i]); - } -} diff --git a/src/05computation/include/computation/operators/cast.h b/src/05computation/include/computation/operators/cast.h index 3af10655c..1c9bc10bd 100644 --- a/src/05computation/include/computation/operators/cast.h +++ b/src/05computation/include/computation/operators/cast.h @@ -7,7 +7,7 @@ namespace refactor::computation { struct Cast final : public Operator { - constexpr explicit Cast() noexcept : Operator() {} + constexpr Cast() noexcept = default; static size_t typeId() noexcept; size_t opTypeId() const noexcept final; diff --git a/src/05computation/include/computation/operators/dequantize_linear.h b/src/05computation/include/computation/operators/dequantize_linear.h new file mode 100644 index 000000000..321b6449b --- /dev/null +++ b/src/05computation/include/computation/operators/dequantize_linear.h @@ -0,0 +1,21 @@ +#ifndef COMPUTATION_DEQUANTIZE_LINEAR_H +#define COMPUTATION_DEQUANTIZE_LINEAR_H + +#include "../operator.h" + +namespace refactor::computation { + + struct DequantizeLinear final : public Operator { + + constexpr DequantizeLinear() noexcept = default; + + static size_t typeId() noexcept; + size_t opTypeId() const noexcept final; + std::string_view name() const noexcept final; + kernel::CollectorBox candidateKernels(Target) const noexcept final; + std::string serialize() const noexcept final; + }; + +}// namespace refactor::computation + +#endif// COMPUTATION_DEQUANTIZE_LINEAR_H diff --git a/src/05computation/include/computation/operators/dynamic_quantize_linear.h b/src/05computation/include/computation/operators/dynamic_quantize_linear.h new file mode 100644 index 000000000..6f57fdd71 --- /dev/null +++ b/src/05computation/include/computation/operators/dynamic_quantize_linear.h @@ -0,0 +1,21 @@ +#ifndef COMPUTATION_DYNAMIC_QUANTIZE_LINEAR_H +#define COMPUTATION_DYNAMIC_QUANTIZE_LINEAR_H + +#include "../operator.h" + +namespace refactor::computation { + + struct DynamicQuantizeLinear final : public Operator { + + constexpr DynamicQuantizeLinear() noexcept = default; + + static size_t typeId() noexcept; + size_t opTypeId() const noexcept final; + std::string_view name() const noexcept final; + kernel::CollectorBox candidateKernels(Target) const noexcept final; + std::string serialize() const noexcept final; + }; + +}// namespace refactor::computation + +#endif// COMPUTATION_DYNAMIC_QUANTIZE_LINEAR_H diff --git a/src/05computation/include/computation/operators/identity.h b/src/05computation/include/computation/operators/identity.h index cf23bf92a..4f887321f 100644 --- a/src/05computation/include/computation/operators/identity.h +++ 
b/src/05computation/include/computation/operators/identity.h @@ -6,7 +6,8 @@ namespace refactor::computation { struct Identity final : public Operator { - constexpr Identity() noexcept : Operator() {} + + constexpr Identity() noexcept = default; static size_t typeId() noexcept; size_t opTypeId() const noexcept final; diff --git a/src/05computation/include/computation/operators/mat_mul_integer.h b/src/05computation/include/computation/operators/mat_mul_integer.h new file mode 100644 index 000000000..262469974 --- /dev/null +++ b/src/05computation/include/computation/operators/mat_mul_integer.h @@ -0,0 +1,21 @@ +#ifndef COMPUTATION_MAT_MUL_INTEGER_H +#define COMPUTATION_MAT_MUL_INTEGER_H + +#include "../operator.h" + +namespace refactor::computation { + + struct MatMulInteger final : public LayoutDependentOperator { + + constexpr MatMulInteger() noexcept = default; + + static size_t typeId() noexcept; + size_t opTypeId() const noexcept final; + std::string_view name() const noexcept final; + kernel::CollectorBox candidateKernels(Target) const noexcept final; + std::string serialize() const noexcept final; + }; + +}// namespace refactor::computation + +#endif// #ifndef COMPUTATION_MAT_MUL_INTEGER_H diff --git a/src/05computation/include/computation/operators/where.h b/src/05computation/include/computation/operators/where.h index a50a28cfc..5af1296cc 100644 --- a/src/05computation/include/computation/operators/where.h +++ b/src/05computation/include/computation/operators/where.h @@ -6,7 +6,8 @@ namespace refactor::computation { struct Where final : public Operator { - constexpr Where() noexcept : Operator() {} + + constexpr Where() noexcept = default; static size_t typeId() noexcept; size_t opTypeId() const noexcept final; diff --git a/src/05computation/src/operators/dequantize_linear.cc b/src/05computation/src/operators/dequantize_linear.cc new file mode 100644 index 000000000..e4c1cf726 --- /dev/null +++ b/src/05computation/src/operators/dequantize_linear.cc @@ -0,0 +1,21 @@ +#include "computation/operators/dequantize_linear.h" +#include "kernel/collectors/dequantize_linear.h" + +namespace refactor::computation { + using Op = DequantizeLinear; + + size_t Op::typeId() noexcept { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + size_t Op::opTypeId() const noexcept { return typeId(); } + std::string_view Op::name() const noexcept { return "DequantizeLinear"; } + auto Op::candidateKernels(Target target) const noexcept -> kernel::CollectorBox { + using Collector = kernel::DequantizeLinearCollector; + return std::make_unique(target); + } + auto Op::serialize() const noexcept -> std::string { + return "DequantizeLinear()"; + } + +}// namespace refactor::computation diff --git a/src/05computation/src/operators/dynamic_quantize_linear.cc b/src/05computation/src/operators/dynamic_quantize_linear.cc new file mode 100644 index 000000000..88cff76a3 --- /dev/null +++ b/src/05computation/src/operators/dynamic_quantize_linear.cc @@ -0,0 +1,21 @@ +#include "computation/operators/dynamic_quantize_linear.h" +#include "kernel/collectors/dynamic_quantize_linear.h" + +namespace refactor::computation { + using Op = DynamicQuantizeLinear; + + size_t Op::typeId() noexcept { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + size_t Op::opTypeId() const noexcept { return typeId(); } + std::string_view Op::name() const noexcept { return "DynamicQuantizeLinear"; } + auto Op::candidateKernels(Target target) const noexcept -> kernel::CollectorBox { + using Collector = 
kernel::DynamicQuantizeLinearCollector; + return std::make_unique(target); + } + auto Op::serialize() const noexcept -> std::string { + return "DynamicQuantizeLinear()"; + } + +}// namespace refactor::computation diff --git a/src/05computation/src/operators/mat_mul_integer.cc b/src/05computation/src/operators/mat_mul_integer.cc new file mode 100644 index 000000000..4d03d21b3 --- /dev/null +++ b/src/05computation/src/operators/mat_mul_integer.cc @@ -0,0 +1,20 @@ +#include "computation/operators/mat_mul_integer.h" +#include "kernel/collectors/mat_mul_integer.h" + +namespace refactor::computation { + using Op = MatMulInteger; + + auto Op::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + auto Op::opTypeId() const noexcept -> size_t { return typeId(); } + auto Op::name() const noexcept -> std::string_view { return "MatMulInteger"; } + auto Op::candidateKernels(Target target) const noexcept -> kernel::CollectorBox { + return std::make_unique(target); + } + auto Op::serialize() const noexcept -> std::string { + return "MatMulInteger()"; + } + +}// namespace refactor::computation diff --git a/src/07onnx/src/operators.cpp b/src/07onnx/src/operators.cpp index 62436887d..6e657f272 100644 --- a/src/07onnx/src/operators.cpp +++ b/src/07onnx/src/operators.cpp @@ -8,13 +8,17 @@ #include "operators/constant_of_shape.hh" #include "operators/conv.hh" #include "operators/cum_sum.hh" +#include "operators/dequantize_linear.hh" +#include "operators/dynamic_quantize_linear.hh" #include "operators/einsum.hh" #include "operators/expand.hh" +#include "operators/flatten.hh" #include "operators/gather.hh" #include "operators/gather_elements.hh" #include "operators/gemm.hh" #include "operators/global_pool.hh" #include "operators/mat_mul.hh" +#include "operators/mat_mul_integer.hh" #include "operators/pool.hh" #include "operators/range.hh" #include "operators/reduce.hh" @@ -38,84 +42,88 @@ namespace refactor::onnx { void register_() { // clang-format off #define REGISTER(NAME, CLASS) Operator::register_("onnx::" #NAME) - REGISTER(BatchNormalization, BatchNormalization); - REGISTER(Cast , Cast ); - REGISTER(Clip , Clip ); - REGISTER(Equal , Compair ); - REGISTER(Greater , Compair ); - REGISTER(GreaterOrEqual , Compair ); - REGISTER(Less , Compair ); - REGISTER(LessOrEqual , Compair ); - REGISTER(Concat , Concat ); - REGISTER(Constant , Constant ); - REGISTER(ConstantOfShape , ConstantOfShape ); - REGISTER(Conv , Conv ); - REGISTER(CumSum , CumSum ); - REGISTER(Einsum , Einsum ); - REGISTER(Expand , Expand ); - REGISTER(Gather , Gather ); - REGISTER(GatherElements , GatherElements ); - REGISTER(Gemm , Gemm ); - REGISTER(GlobalAveragePool , GlobalPool ); - REGISTER(GlobalLpPool , GlobalPool ); - REGISTER(GlobalMaxPool , GlobalPool ); - REGISTER(MatMul , MatMul ); - REGISTER(AveragePool , Pool ); - REGISTER(LpPool , Pool ); - REGISTER(MaxPool , Pool ); - REGISTER(Range , Range ); - REGISTER(ReduceMean , Reduce ); - REGISTER(ReduceL1 , Reduce ); - REGISTER(ReduceL2 , Reduce ); - REGISTER(ReduceLogSum , Reduce ); - REGISTER(ReduceLogSumExp , Reduce ); - REGISTER(ReduceMax , Reduce ); - REGISTER(ReduceMin , Reduce ); - REGISTER(ReduceProd , Reduce ); - REGISTER(ReduceSum , Reduce ); - REGISTER(ReduceSumSquare , Reduce ); - REGISTER(Reshape , Reshape ); - REGISTER(ScatterND , ScatterND ); - REGISTER(Max , Select ); - REGISTER(Min , Select ); - REGISTER(Shape , Shape ); - REGISTER(Add , SimpleBinary ); - REGISTER(Sub , SimpleBinary ); - REGISTER(Mul , SimpleBinary ); - REGISTER(Div , 
SimpleBinary ); - REGISTER(Pow , SimpleBinary ); - REGISTER(And , SimpleBinary ); - REGISTER(Or , SimpleBinary ); - REGISTER(Xor , SimpleBinary ); - REGISTER(Abs , SimpleUnary ); - REGISTER(Acos , SimpleUnary ); - REGISTER(Acosh , SimpleUnary ); - REGISTER(Asin , SimpleUnary ); - REGISTER(Asinh , SimpleUnary ); - REGISTER(Atan , SimpleUnary ); - REGISTER(Atanh , SimpleUnary ); - REGISTER(Cos , SimpleUnary ); - REGISTER(Cosh , SimpleUnary ); - REGISTER(Sin , SimpleUnary ); - REGISTER(Sinh , SimpleUnary ); - REGISTER(Tan , SimpleUnary ); - REGISTER(Tanh , SimpleUnary ); - REGISTER(Relu , SimpleUnary ); - REGISTER(Sqrt , SimpleUnary ); - REGISTER(Sigmoid , SimpleUnary ); - REGISTER(Erf , SimpleUnary ); - REGISTER(Log , SimpleUnary ); - REGISTER(Not , SimpleUnary ); - REGISTER(Neg , SimpleUnary ); - REGISTER(Identity , SimpleUnary ); - REGISTER(Slice , Slice ); - REGISTER(Softmax , Softmax ); - REGISTER(Split , Split ); - REGISTER(Squeeze , Squeeze ); - REGISTER(Tile , Tile ); - REGISTER(Transpose , Transpose ); - REGISTER(Unsqueeze , Unsqueeze ); - REGISTER(Where , Where ); + REGISTER(BatchNormalization , BatchNormalization ); + REGISTER(Cast , Cast ); + REGISTER(Clip , Clip ); + REGISTER(Equal , Compair ); + REGISTER(Greater , Compair ); + REGISTER(GreaterOrEqual , Compair ); + REGISTER(Less , Compair ); + REGISTER(LessOrEqual , Compair ); + REGISTER(Concat , Concat ); + REGISTER(Constant , Constant ); + REGISTER(ConstantOfShape , ConstantOfShape ); + REGISTER(Conv , Conv ); + REGISTER(DequantizeLinear , DequantizeLinear ); + REGISTER(DynamicQuantizeLinear, DynamicQuantizeLinear); + REGISTER(CumSum , CumSum ); + REGISTER(Einsum , Einsum ); + REGISTER(Expand , Expand ); + REGISTER(Gather , Gather ); + REGISTER(GatherElements , GatherElements ); + REGISTER(Gemm , Gemm ); + REGISTER(GlobalAveragePool , GlobalPool ); + REGISTER(GlobalLpPool , GlobalPool ); + REGISTER(GlobalMaxPool , GlobalPool ); + REGISTER(MatMul , MatMul ); + REGISTER(MatMulInteger , MatMulInteger ); + REGISTER(AveragePool , Pool ); + REGISTER(LpPool , Pool ); + REGISTER(MaxPool , Pool ); + REGISTER(Range , Range ); + REGISTER(ReduceMean , Reduce ); + REGISTER(ReduceL1 , Reduce ); + REGISTER(ReduceL2 , Reduce ); + REGISTER(ReduceLogSum , Reduce ); + REGISTER(ReduceLogSumExp , Reduce ); + REGISTER(ReduceMax , Reduce ); + REGISTER(ReduceMin , Reduce ); + REGISTER(ReduceProd , Reduce ); + REGISTER(ReduceSum , Reduce ); + REGISTER(ReduceSumSquare , Reduce ); + REGISTER(Reshape , Reshape ); + REGISTER(Flatten , Flatten ); + REGISTER(ScatterND , ScatterND ); + REGISTER(Max , Select ); + REGISTER(Min , Select ); + REGISTER(Shape , Shape ); + REGISTER(Add , SimpleBinary ); + REGISTER(Sub , SimpleBinary ); + REGISTER(Mul , SimpleBinary ); + REGISTER(Div , SimpleBinary ); + REGISTER(Pow , SimpleBinary ); + REGISTER(And , SimpleBinary ); + REGISTER(Or , SimpleBinary ); + REGISTER(Xor , SimpleBinary ); + REGISTER(Abs , SimpleUnary ); + REGISTER(Acos , SimpleUnary ); + REGISTER(Acosh , SimpleUnary ); + REGISTER(Asin , SimpleUnary ); + REGISTER(Asinh , SimpleUnary ); + REGISTER(Atan , SimpleUnary ); + REGISTER(Atanh , SimpleUnary ); + REGISTER(Cos , SimpleUnary ); + REGISTER(Cosh , SimpleUnary ); + REGISTER(Sin , SimpleUnary ); + REGISTER(Sinh , SimpleUnary ); + REGISTER(Tan , SimpleUnary ); + REGISTER(Tanh , SimpleUnary ); + REGISTER(Relu , SimpleUnary ); + REGISTER(Sqrt , SimpleUnary ); + REGISTER(Sigmoid , SimpleUnary ); + REGISTER(Erf , SimpleUnary ); + REGISTER(Log , SimpleUnary ); + REGISTER(Not , SimpleUnary ); + REGISTER(Neg , 
SimpleUnary          );
+        REGISTER(Identity             , SimpleUnary          );
+        REGISTER(Slice                , Slice                );
+        REGISTER(Softmax              , Softmax              );
+        REGISTER(Split                , Split                );
+        REGISTER(Squeeze              , Squeeze              );
+        REGISTER(Tile                 , Tile                 );
+        REGISTER(Transpose            , Transpose            );
+        REGISTER(Unsqueeze            , Unsqueeze            );
+        REGISTER(Where                , Where                );
 #undef REGISTER
         // clang-format on
     }
diff --git a/src/07onnx/src/operators/dequantize_linear.cc b/src/07onnx/src/operators/dequantize_linear.cc
new file mode 100644
index 000000000..5dc691513
--- /dev/null
+++ b/src/07onnx/src/operators/dequantize_linear.cc
@@ -0,0 +1,52 @@
+#include "dequantize_linear.hh"
+#include "common.h"
+#include "computation/operators/dequantize_linear.h"
+
+namespace refactor::onnx {
+    using Op = DequantizeLinear;
+
+    Op::DequantizeLinear(Int axis_) noexcept
+        : Operator(), axis(axis_) {}
+
+    auto Op::build(ModelContext const &, std::string_view, Attributes attrs) -> OpBox {
+        auto axis = defaultOr(attrs, "axis", {1}).int_();
+        return OpBox(std::make_unique<Op>(axis));
+    }
+    auto Op::typeId() -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto Op::opTypeId() const -> size_t { return typeId(); }
+    auto Op::opTypeName() const -> std::string_view { return "onnx::DequantizeLinear"; }
+
+    auto Op::infer(TensorRefs inputs, InferOptions const &options) const -> InferResult {
+        switch (inputs.size()) {
+            case 2:
+            case 3:
+                break;
+            default:
+                return Err(InferError(ERROR_MSG("Input size error")));
+        }
+
+        auto const &x = inputs[0];
+        auto const &xScale = inputs[1];
+        if (xScale.rank() != 0) {
+            return Err(InferError(ERROR_MSG("Only support per-tensor quantization currently")));
+        }
+        if (inputs.size() > 2) {
+            auto const &xZeroPoint = inputs[2];
+            if (xZeroPoint.dataType != x.dataType || xZeroPoint.shape != xScale.shape) {
+                return Err(InferError(ERROR_MSG("x_zero_point info mismatch")));
+            }
+        }
+
+        return Ok(Tensors{Tensor::share(xScale.dataType, x.shape, extractDependency(inputs))});
+    }
+
+    auto Op::lower(TensorRefs inputs) const -> computation::OpBox {
+        using Op_ = computation::DequantizeLinear;
+        return std::make_unique<Op_>();
+    }
+
+}// namespace refactor::onnx
diff --git a/src/07onnx/src/operators/dequantize_linear.hh b/src/07onnx/src/operators/dequantize_linear.hh
new file mode 100644
index 000000000..a24c7bbac
--- /dev/null
+++ b/src/07onnx/src/operators/dequantize_linear.hh
@@ -0,0 +1,25 @@
+#ifndef ONNX_DEQUANTIZE_LINEAR_HH
+#define ONNX_DEQUANTIZE_LINEAR_HH
+
+#include "frontend/operator.h"
+
+namespace refactor::onnx {
+    using namespace frontend;
+
+    struct DequantizeLinear final : public Operator {
+        Int axis;
+
+        explicit DequantizeLinear(Int) noexcept;
+
+        static OpBox build(ModelContext const &, std::string_view, Attributes);
+        static size_t typeId();
+
+        size_t opTypeId() const final;
+        std::string_view opTypeName() const final;
+        InferResult infer(TensorRefs, InferOptions const &) const final;
+        computation::OpBox lower(TensorRefs) const final;
+    };
+
+}// namespace refactor::onnx
+
+#endif// ONNX_DEQUANTIZE_LINEAR_HH
diff --git a/src/07onnx/src/operators/dynamic_quantize_linear.cc b/src/07onnx/src/operators/dynamic_quantize_linear.cc
new file mode 100644
index 000000000..17031607b
--- /dev/null
+++ b/src/07onnx/src/operators/dynamic_quantize_linear.cc
@@ -0,0 +1,39 @@
+#include "dynamic_quantize_linear.hh"
+#include "common.h"
+#include "computation/operators/dynamic_quantize_linear.h"
+
+namespace refactor::onnx {
+    using Op = DynamicQuantizeLinear;
+
+    auto Op::build(ModelContext const &, std::string_view, Attributes) -> OpBox {
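+        // Reviewer sketch (illustrative, not part of this patch): per the ONNX
+        // DynamicQuantizeLinear spec the three outputs are
+        //     scale      = (max(x, 0) - min(x, 0)) / 255
+        //     zero_point = clamp(round(-min(x, 0) / scale), 0, 255)
+        //     y          = clamp(round(x / scale) + zero_point, 0, 255)
+        // For the CPU test's input {0, 2, -3, -2.5, 1.34, 0.5} this gives
+        // scale = 5 / 255 and zero_point = round(3 * 255 / 5) = 153, matching
+        // the expectations in dynamic_quantize_linear/test_cpu.cpp above.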
+ return OpBox(std::make_unique()); + } + auto Op::typeId() -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto Op::opTypeId() const -> size_t { return typeId(); } + auto Op::opTypeName() const -> std::string_view { return "onnx::DynamicQuantizeLinear"; } + + auto Op::infer(TensorRefs inputs, InferOptions const &options) const -> InferResult { + EXPECT_SIZE(1) + + auto const &x = inputs[0]; + if (x.dataType != DataType::F32) { + return Err(InferError(ERROR_MSG("Input data type not support"))); + } + auto deps = extractDependency(inputs); + return Ok(Tensors{ + Tensor::share(DataType::U8, x.shape, deps), + Tensor::share(DataType::F32, {}, deps), + Tensor::share(DataType::U8, {}, deps), + }); + } + + auto Op::lower(TensorRefs inputs) const -> computation::OpBox { + using Op_ = computation::DynamicQuantizeLinear; + return std::make_unique(); + } + +}// namespace refactor::onnx diff --git a/src/07onnx/src/operators/dynamic_quantize_linear.hh b/src/07onnx/src/operators/dynamic_quantize_linear.hh new file mode 100644 index 000000000..899a97695 --- /dev/null +++ b/src/07onnx/src/operators/dynamic_quantize_linear.hh @@ -0,0 +1,24 @@ +#ifndef ONNX_DYNAMIC_QUANTIZE_LINEAR_HH +#define ONNX_DYNAMIC_QUANTIZE_LINEAR_HH + +#include "frontend/operator.h" + +namespace refactor::onnx { + using namespace frontend; + + struct DynamicQuantizeLinear final : public Operator { + + DynamicQuantizeLinear() = default; + + static OpBox build(ModelContext const &, std::string_view, Attributes); + static size_t typeId(); + + size_t opTypeId() const final; + std::string_view opTypeName() const final; + InferResult infer(TensorRefs, InferOptions const &) const final; + computation::OpBox lower(TensorRefs) const final; + }; + +}// namespace refactor::onnx + +#endif// ONNX_DYNAMIC_QUANTIZE_LINEAR_HH diff --git a/src/07onnx/src/operators/expand.cc b/src/07onnx/src/operators/expand.cc index 35af90e6f..b5e729810 100644 --- a/src/07onnx/src/operators/expand.cc +++ b/src/07onnx/src/operators/expand.cc @@ -6,8 +6,6 @@ namespace refactor::onnx { using Op = Expand; - Op::Expand() : Operator() {} - auto Op::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { ASSERT(attributes.empty(), "Expand operator should not have attributes"); return OpBox(std::make_unique()); diff --git a/src/07onnx/src/operators/expand.hh b/src/07onnx/src/operators/expand.hh index f621306ba..a4f3b63e1 100644 --- a/src/07onnx/src/operators/expand.hh +++ b/src/07onnx/src/operators/expand.hh @@ -7,7 +7,8 @@ namespace refactor::onnx { using namespace frontend; struct Expand final : public Operator { - Expand(); + + constexpr Expand() noexcept = default; static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); diff --git a/src/07onnx/src/operators/mat_mul.cc b/src/07onnx/src/operators/mat_mul.cc index b9acd44b1..0850ba6f0 100644 --- a/src/07onnx/src/operators/mat_mul.cc +++ b/src/07onnx/src/operators/mat_mul.cc @@ -5,8 +5,6 @@ namespace refactor::onnx { using Op = MatMul; - Op::MatMul() : Operator() {} - auto Op::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { ASSERT(attributes.empty(), "MatMul operator should not have attributes"); return OpBox(std::make_unique()); diff --git a/src/07onnx/src/operators/mat_mul.hh b/src/07onnx/src/operators/mat_mul.hh index d0ce5fcdd..e845de275 100644 --- a/src/07onnx/src/operators/mat_mul.hh +++ b/src/07onnx/src/operators/mat_mul.hh @@ -7,7 +7,8 @@ namespace refactor::onnx { using namespace frontend; 
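 // Reviewer note (illustrative, not part of this patch): the constructor
 // cleanup across these operator headers replaces a user-provided empty
 // constructor with a defaulted one, e.g.
 //
 //     MatMul() : Operator() {}                  // before
 //     constexpr MatMul() noexcept = default;    // after
 //
 // Behavior is unchanged; the defaulted declaration states directly that
 // construction does nothing beyond the base class, and the compiler checks
 // that the constexpr/noexcept qualifiers are actually satisfiable.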
struct MatMul final : public Operator { - MatMul(); + + constexpr MatMul() noexcept = default; static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); diff --git a/src/07onnx/src/operators/mat_mul_integer.cc b/src/07onnx/src/operators/mat_mul_integer.cc new file mode 100644 index 000000000..de5de0525 --- /dev/null +++ b/src/07onnx/src/operators/mat_mul_integer.cc @@ -0,0 +1,102 @@ +#include "mat_mul_integer.hh" +#include "common.h" +#include "computation/operators/mat_mul_integer.h" +#include + +namespace refactor::onnx { + using Op = MatMulInteger; + + auto Op::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { + ASSERT(attributes.empty(), "MatMulInteger operator should not have attributes"); + return OpBox(std::make_unique()); + } + auto Op::typeId() -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto Op::opTypeId() const -> size_t { return typeId(); } + auto Op::opTypeName() const -> std::string_view { return "onnx::MatMulInteger"; } + + static bool checkZeroPoint(TensorRefs inputs, size_t scalarN) { + if (inputs.size() <= scalarN + 2) { + return true; + } + auto const &t = inputs[scalarN]; + auto const &zp = inputs[scalarN + 2]; + if (zp.dataType != t.dataType) { + return false; + } + if (zp.rank() == 0) { + return true; + } + switch (t.rank()) { + case 1: + return zp.shape == decltype(zp.shape){DimExpr(1)}; + case 2: + return zp.shape == decltype(zp.shape){t.shape[scalarN]}; + default: { + auto expect = t.shape; + expect[expect.size() - 1 - scalarN] = DimExpr(1); + return zp.shape == expect; + } + } + } + + auto Op::infer(TensorRefs inputs, InferOptions const &options) const -> InferResult { + static const std::unordered_set + SET{DataType::I8, DataType::U8}; + + if (inputs.size() < 2 || 4 < inputs.size()) { + return Err(InferError(ERROR_MSG("Input size not support"))); + } + if (!checkZeroPoint(inputs, 0) || !checkZeroPoint(inputs, 1)) { + return Err(InferError(ERROR_MSG("Input zero point not support"))); + } + + auto const &a = inputs[0]; + auto const &b = inputs[1]; + if (!SET.contains(a.dataType) || !SET.contains(b.dataType)) { + return Err(InferError(ERROR_MSG("Input data type not support"))); + } + auto sa = a.shape, sb = b.shape; + switch (sa.size()) { + case 1: + sa.emplace(sa.begin(), 1); + break; + case 0: + return Err(InferError(ERROR_MSG("Input shape not support"))); + default: + break; + } + switch (sb.size()) { + case 1: + sb.emplace_back(1); + break; + case 0: + return Err(InferError(ERROR_MSG("Input shape not support"))); + default: + break; + } + auto k = sa.back(); + sa.pop_back(); + auto m = sa.back(); + sa.pop_back(); + auto n = sb.back(); + sb.pop_back(); + if (k != sb.back()) { + return Err(InferError(ERROR_MSG("Input shape not support"))); + } + sb.pop_back(); + MULTIDIR_BROADCAST((ShapeRefs{sa, sb})) + output.emplace_back(std::move(m)); + output.emplace_back(std::move(n)); + return Ok(Tensors{Tensor::share(DataType::I32, std::move(output), extractDependency(inputs))}); + } + + auto Op::lower(TensorRefs) const -> computation::OpBox { + using Op_ = computation::MatMulInteger; + return std::make_unique(); + } + +}// namespace refactor::onnx diff --git a/src/07onnx/src/operators/mat_mul_integer.hh b/src/07onnx/src/operators/mat_mul_integer.hh new file mode 100644 index 000000000..1d6d164a7 --- /dev/null +++ b/src/07onnx/src/operators/mat_mul_integer.hh @@ -0,0 +1,24 @@ +#ifndef ONNX_MAT_MUL_INTEGER_HH +#define ONNX_MAT_MUL_INTEGER_HH + +#include "frontend/operator.h" 
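+
+// Reviewer sketch (illustrative, not part of this patch): the shapes accepted
+// by the checkZeroPoint helper in mat_mul_integer.cc above. For A: [..., M, K]
+// and B: [..., K, N], a zero point may be
+//     - a scalar (or a 1-element vector for a 1-D operand),
+//     - a vector of length M (for A) / N (for B) when the operand is 2-D,
+//     - otherwise the operand's shape with the K axis collapsed to 1,
+//       i.e. [..., M, 1] for A and [..., 1, N] for B,
+// and its element type must match the operand's. The leading batch dimensions
+// then broadcast bidirectionally, e.g.
+//     A: [2, 1, 3, 4] x B: [1, 5, 4, 6] -> Y: [2, 5, 3, 6] (int32)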
+ +namespace refactor::onnx { + using namespace frontend; + + struct MatMulInteger final : public Operator { + + constexpr MatMulInteger() noexcept = default; + + static OpBox build(ModelContext const &, std::string_view, Attributes); + static size_t typeId(); + + size_t opTypeId() const final; + std::string_view opTypeName() const final; + InferResult infer(TensorRefs, InferOptions const &) const final; + computation::OpBox lower(TensorRefs) const final; + }; + +}// namespace refactor::onnx + +#endif// ONNX_MAT_MUL_INTEGER_HH diff --git a/src/07onnx/src/operators/range.cc b/src/07onnx/src/operators/range.cc index 8f4ce357a..d5191734b 100644 --- a/src/07onnx/src/operators/range.cc +++ b/src/07onnx/src/operators/range.cc @@ -4,8 +4,6 @@ namespace refactor::onnx { using Op = Range; - Op::Range() : Operator() {} - auto Op::build(ModelContext const &, std::string_view, Attributes) -> OpBox { return OpBox(std::make_unique()); } diff --git a/src/07onnx/src/operators/range.hh b/src/07onnx/src/operators/range.hh index baf29aa20..dcf17462a 100644 --- a/src/07onnx/src/operators/range.hh +++ b/src/07onnx/src/operators/range.hh @@ -7,7 +7,8 @@ namespace refactor::onnx { using namespace frontend; struct Range final : public Operator { - Range(); + + constexpr Range() noexcept = default; static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); diff --git a/src/07onnx/src/operators/slice.cc b/src/07onnx/src/operators/slice.cc index 49540e245..0a0853a85 100644 --- a/src/07onnx/src/operators/slice.cc +++ b/src/07onnx/src/operators/slice.cc @@ -7,8 +7,6 @@ namespace refactor::onnx { using computation::Dimensions; using Op = Slice; - Op::Slice() : Operator() {} - auto Op::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { ASSERT(attributes.empty(), "Slice operator should not have attributes"); return OpBox(std::make_unique()); diff --git a/src/07onnx/src/operators/slice.hh b/src/07onnx/src/operators/slice.hh index 8afa0067b..3a082a1d5 100644 --- a/src/07onnx/src/operators/slice.hh +++ b/src/07onnx/src/operators/slice.hh @@ -7,7 +7,8 @@ namespace refactor::onnx { using namespace frontend; struct Slice final : public Operator { - Slice(); + + constexpr Slice() noexcept = default; static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); diff --git a/src/07onnx/src/operators/tile.cc b/src/07onnx/src/operators/tile.cc index a1fa4b293..14a13fb2f 100644 --- a/src/07onnx/src/operators/tile.cc +++ b/src/07onnx/src/operators/tile.cc @@ -5,8 +5,6 @@ namespace refactor::onnx { using Op = Tile; - Op::Tile() : Operator() {} - auto Op::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { ASSERT(attributes.empty(), "Tile operator should not have attributes"); return OpBox(std::make_unique()); diff --git a/src/07onnx/src/operators/tile.hh b/src/07onnx/src/operators/tile.hh index 1d4e60e32..9fed86358 100644 --- a/src/07onnx/src/operators/tile.hh +++ b/src/07onnx/src/operators/tile.hh @@ -8,7 +8,7 @@ namespace refactor::onnx { struct Tile final : public Operator { - Tile(); + constexpr Tile() noexcept = default; static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); diff --git a/src/07onnx/src/operators/where.cc b/src/07onnx/src/operators/where.cc index 757ba0c78..eea30e691 100644 --- a/src/07onnx/src/operators/where.cc +++ b/src/07onnx/src/operators/where.cc @@ -6,8 +6,6 @@ namespace refactor::onnx { using Op = Where; - Op::Where() : Operator() {} - auto 
Op::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { ASSERT(attributes.empty(), "Where operator should not have attributes"); return OpBox(std::make_unique()); diff --git a/src/07onnx/src/operators/where.hh b/src/07onnx/src/operators/where.hh index defd0af0b..9c0d9ad68 100644 --- a/src/07onnx/src/operators/where.hh +++ b/src/07onnx/src/operators/where.hh @@ -8,7 +8,7 @@ namespace refactor::onnx { struct Where final : public Operator { - Where(); + constexpr Where() noexcept = default; static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); diff --git a/src/08communication/src/operators/all_reduce.cc b/src/08communication/src/operators/all_reduce.cc index c508b7b17..989ed6ad9 100644 --- a/src/08communication/src/operators/all_reduce.cc +++ b/src/08communication/src/operators/all_reduce.cc @@ -4,8 +4,6 @@ namespace refactor::communication { using Op = AllReduce; - Op::AllReduce() : Operator() {} - auto Op::build(ModelContext const &, std::string_view, Attributes) -> OpBox { return OpBox(std::make_unique()); } diff --git a/src/08communication/src/operators/all_reduce.hh b/src/08communication/src/operators/all_reduce.hh index a31d6f2f6..a14bc74f4 100644 --- a/src/08communication/src/operators/all_reduce.hh +++ b/src/08communication/src/operators/all_reduce.hh @@ -8,7 +8,7 @@ namespace refactor::communication { struct AllReduce final : public Operator { - AllReduce(); + constexpr AllReduce() noexcept = default; static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId();
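
// Reviewer sketch (illustrative, not part of this patch; the helper name is
// hypothetical): a plain int32-accumulating reference for what the
// MatMulInteger kernels above compute, y = (a - a_zp) . (b - b_zp). With
// A = {1,...,6} as 2x3, B = {1, 2, 3} as 3x1 and no zero points it returns
// {14, 32}, the expected output in mat_mul_integer/test_cpu_kernel.cpp.
#include <cstdint>
#include <vector>

static std::vector<int32_t> matMulIntegerRef(
    std::vector<uint8_t> const &a, std::vector<uint8_t> const &b,
    int m, int k, int n, uint8_t aZp = 0, uint8_t bZp = 0) {
    std::vector<int32_t> y(m * n, 0);
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
            for (int t = 0; t < k; ++t)
                // widen to int32 before subtracting so uint8 values cannot wrap
                y[i * n + j] += (int32_t(a[i * k + t]) - int32_t(aZp)) *
                                (int32_t(b[t * n + j]) - int32_t(bZp));
    return y;
}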