diff --git a/src/packing.cc b/src/packing.cc
index 1bd7912db06..a52a065b6f9 100644
--- a/src/packing.cc
+++ b/src/packing.cc
@@ -41,12 +41,32 @@ struct unaligned_int32_t {
   XNN_INLINE operator int32_t() const { return value; } // NOLINT: emulating __attribute__((__aligned__(x))) int32_t.
 };
 
+template <typename Src, typename Dst>
+void copy_n(const Src* src, size_t n, Dst* dst) {
+  for (size_t i = 0; i < n; ++i) {
+    dst[i] = src[i];
+  }
+}
+
+template <typename Src, typename Dst>
+int32_t copy_n_and_sum(const Src* src, size_t n, Dst* dst) {
+  int32_t sum = 0;
+  for (size_t i = 0; i < n; ++i) {
+    const auto v = src[i];
+    sum += (int32_t) v;
+    dst[i] = v;
+  }
+  return sum;
+}
+
 template <typename Src, typename Dst>
 void copy_bias(const Src* b, size_t b_offset, size_t n, Dst* packed_b) {
   if (b) {
-    std::copy_n(b + b_offset, n, packed_b);
+    copy_n(b + b_offset, n, packed_b);
   } else {
-    std::fill_n(packed_b, n, 0);
+    for (size_t i = 0; i < n; ++i) {
+      packed_b[i] = 0;
+    }
   }
 }
 
@@ -54,22 +74,13 @@ template <typename Src, typename Dst>
 void copy_bias(const Src* b, size_t b_offset, size_t n, Dst* packed_b, Src zero_point) {
   if (b) {
     for (size_t i = 0; i < n; ++i) {
-      *packed_b++ = zero_point + b[b_offset + i];
+      packed_b[i] = zero_point + b[b_offset + i];
     }
   } else {
-    std::fill_n(packed_b, n, zero_point);
-  }
-}
-
-template <typename Src, typename Dst>
-int32_t copy_n_and_sum(const Src* src, size_t n, Dst* dst) {
-  int32_t sum = 0;
-  for (size_t i = 0; i < n; ++i) {
-    const auto v = *src++;
-    sum += (int32_t) v;
-    *dst++ = v;
+    for (size_t i = 0; i < n; ++i) {
+      packed_b[i] = zero_point;
+    }
   }
-  return sum;
 }
 
 extern "C" {
@@ -105,7 +116,7 @@ void xnn_pack_f32_gemm_goi_w(
           const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
           const size_t kc_end = std::min(kc, kc_begin + kr);
           if (kc_begin < kc_end) {
-            std::copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+            copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
           }
           packed_weights += kr;
         }
@@ -151,7 +162,7 @@ void xnn_pack_f16_gemm_goi_w(
           const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
           const size_t kc_end = std::min(kc, kc_begin + kr);
           if (kc_begin < kc_end) {
-            std::copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+            copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
           }
           packed_weights += kr;
         }
@@ -197,7 +208,7 @@ void xnn_pack_f32_to_f16_gemm_goi_w(
           const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
           const size_t kc_end = std::min(kc, kc_begin + kr);
           if (kc_begin < kc_end) {
-            std::copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+            copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
           }
           packed_weights += kr;
         }
@@ -934,7 +945,7 @@ void xnn_pack_f32_qs8w_gemm_goi_w(
           const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
           const size_t kc_end = std::min(kc, kc_begin + kr);
           if (kc_begin < kc_end) {
-            std::copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, (int8_t*) packed_weights);
+            copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, (int8_t*) packed_weights);
           }
           packed_weights = (int8_t*) packed_weights + kr;
         }
@@ -985,7 +996,7 @@ void xnn_pack_f32_qc4w_gemm_goi_w(
           const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
           const size_t kc_end = std::min(kc, kc_begin + kr);
          if (kc_begin < kc_end) {
-            std::copy_n(&((const uint8_t*) k)[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, (uint8_t*) packed_weights);
+            copy_n(&((const uint8_t*) k)[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, (uint8_t*) packed_weights);
           }
           packed_weights = (uint8_t*) packed_weights + kr;
         }
@@ -1022,7 +1033,7 @@ void xnn_pack_f32_gemm_gio_w(size_t g, size_t nc, size_t kc, size_t nr,
       for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start++) {
         const size_t kc_idx = round_down_po2(kr_block_start, skr);
         if (kc_idx < kc) {
-          std::copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size,
+          copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size,
                       packed_weights);
         }
         packed_weights += nr;
@@ -1080,7 +1091,7 @@ void xnn_pack_f16_gemm_gio_w(size_t g, size_t nc, size_t kc, size_t nr,
       for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start++) {
         const size_t kc_idx = round_down_po2(kr_block_start, skr);
         if (kc_idx < kc) {
-          std::copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size,
+          copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size,
                       packed_weights);
         }
         packed_weights += nr;
@@ -1773,7 +1784,7 @@ void xnn_pack_f32_conv_goki_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
 
@@ -1783,7 +1794,7 @@ void xnn_pack_f32_conv_goki_w(
             const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
             const size_t kc_end = std::min(kc, kc_begin + kr);
             if (kc_begin < kc_end) {
-              std::copy_n(&k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+              copy_n(&k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_begin], kc_end - kc_begin, packed_weights);
             }
             packed_weights += kr;
           }
@@ -1824,7 +1835,7 @@ void xnn_pack_f16_conv_goki_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
 
@@ -1834,7 +1845,7 @@ void xnn_pack_f16_conv_goki_w(
             const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
             const size_t kc_end = std::min(kc, kc_begin + kr);
             if (kc_begin < kc_end) {
-              std::copy_n(&k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+              copy_n(&k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_begin], kc_end - kc_begin, packed_weights);
             }
             packed_weights += kr;
           }
@@ -1875,7 +1886,7 @@ void xnn_pack_f32_to_f16_conv_goki_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
 
@@ -1885,7 +1896,7 @@ void xnn_pack_f32_to_f16_conv_goki_w(
             const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
             const size_t kc_end = std::min(kc, kc_begin + kr);
             if (kc_begin < kc_end) {
-              std::copy_n(&k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+              copy_n(&k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_begin], kc_end - kc_begin, packed_weights);
             }
             packed_weights += kr;
           }
@@ -2081,7 +2092,7 @@ void xnn_pack_f32_conv_kgo_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
 
@@ -2125,7 +2136,7 @@ void xnn_pack_f16_conv_kgo_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
 
@@ -2169,7 +2180,7 @@ void xnn_pack_f32_to_f16_conv_kgo_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
 
@@ -2354,7 +2365,7 @@ void xnn_pack_f32_deconv_goki_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
       for (size_t ky = oy; ky < kh; ky += sh) {
@@ -2364,7 +2375,7 @@ void xnn_pack_f32_deconv_goki_w(
               const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
               const size_t kc_end = std::min(kc, kc_begin + kr);
               if (kc_begin < kc_end) {
-                std::copy_n(&k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+                copy_n(&k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_begin], kc_end - kc_begin, packed_weights);
               }
               packed_weights += kr;
             }
@@ -2417,7 +2428,7 @@ void xnn_pack_f16_deconv_goki_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
       for (size_t ky = oy; ky < kh; ky += sh) {
@@ -2427,7 +2438,7 @@ void xnn_pack_f16_deconv_goki_w(
               const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
               const size_t kc_end = std::min(kc, kc_begin + kr);
               if (kc_begin < kc_end) {
-                std::copy_n(&k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+                copy_n(&k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_begin], kc_end - kc_begin, packed_weights);
               }
               packed_weights += kr;
             }
@@ -2480,7 +2491,7 @@ void xnn_pack_f32_to_f16_deconv_goki_w(
     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
       const size_t nr_block_size = min(nc - nr_block_start, nr);
       if XNN_LIKELY(b != nullptr) {
-        std::copy_n(&b[nr_block_start], nr_block_size, packed_weights);
+        copy_n(&b[nr_block_start], nr_block_size, packed_weights);
       }
       packed_weights += nr;
       for (size_t ky = oy; ky < kh; ky += sh) {
@@ -2490,7 +2501,7 @@ void xnn_pack_f32_to_f16_deconv_goki_w(
               const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
               const size_t kc_end = std::min(kc, kc_begin + kr);
               if (kc_begin < kc_end) {
-                std::copy_n(&k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+                copy_n(&k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_begin], kc_end - kc_begin, packed_weights);
               }
               packed_weights += kr;
             }
@@ -4585,7 +4596,7 @@ void xnn_pack_f32_gemminc_goi_w(
           const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
           const size_t kc_end = std::min(kc, kc_begin + kr);
           if (kc_begin < kc_end) {
-            std::copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+            copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
           }
         }
         packed_weights += kr;
@@ -4624,7 +4635,7 @@ void xnn_pack_f16_gemminc_goi_w(
           const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1));
           const size_t kc_end = std::min(kc, kc_begin + kr);
           if (kc_begin < kc_end) {
-            std::copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
+            copy_n(&k[(nr_block_start + nr_block_offset) * kc + kc_begin], kc_end - kc_begin, packed_weights);
           }
         }
         packed_weights += kr;
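
Note (not part of the patch): the two helpers added at the top of the file replace std::copy_n/std::fill_n with plain index-based loops, and copy_n_and_sum additionally accumulates the copied values into an int32_t, allowing element-type conversion during packing. A minimal standalone sketch of how they behave follows; the weight values and the main() driver are invented purely for illustration and make no assumptions beyond the helper definitions shown above.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Same shape as the helpers introduced in this patch: element-wise copy,
    // optionally accumulating the copied values into an int32_t sum.
    template <typename Src, typename Dst>
    void copy_n(const Src* src, size_t n, Dst* dst) {
      for (size_t i = 0; i < n; ++i) {
        dst[i] = src[i];
      }
    }

    template <typename Src, typename Dst>
    int32_t copy_n_and_sum(const Src* src, size_t n, Dst* dst) {
      int32_t sum = 0;
      for (size_t i = 0; i < n; ++i) {
        const auto v = src[i];
        sum += (int32_t) v;
        dst[i] = v;
      }
      return sum;
    }

    int main() {
      const int8_t weights[4] = {1, -2, 3, -4};  // made-up example values
      int8_t packed[4];
      // Copies the four weights into `packed` and returns 1 - 2 + 3 - 4 = -2.
      const int32_t row_sum = copy_n_and_sum(weights, 4, packed);
      std::printf("row_sum = %d\n", row_sum);

      float widened[4];
      copy_n(packed, 4, widened);  // Src and Dst may differ, e.g. int8_t -> float.
      return 0;
    }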