diff --git a/src/layer/arm/quantize_arm.cpp b/src/layer/arm/quantize_arm.cpp
index 6e395a9bb76..46b44d104c3 100644
--- a/src/layer/arm/quantize_arm.cpp
+++ b/src/layer/arm/quantize_arm.cpp
@@ -39,6 +39,59 @@ Quantize_arm::Quantize_arm()
 #endif
 }
 
+static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
+{
+    const int scale_data_size = scale_data.w;
+    const int size = elemcount * elempack; // total number of scalar floats to convert
+    // Multiply `size` floats read from ptr by the scale(s) and store the float2int8() results at s8ptr.
+    // NCNN_LOGE("quantize %d   %d %d", scale_data_size, elemcount, elempack);
+
+    float scale = scale_data[0]; // first scale: broadcast below and used by the scalar tail
+#if __ARM_NEON
+    float32x4_t _scale = vdupq_n_f32(scale);
+    if (scale_data_size > 1) // several scales are only honoured for elempack == 4; otherwise scale_data[0] is used
+    {
+        if (elempack == 4)
+        {
+            _scale = vld1q_f32((const float*)scale_data); // 4 consecutive scales, applied lane-wise to each group of 4 floats
+        }
+    }
+#endif // __ARM_NEON
+
+    int i = 0;
+#if __ARM_NEON
+    for (; i + 7 < size; i += 8) // 8 floats per iteration, written with a single 8-byte store
+    {
+        float32x4_t _v0 = vld1q_f32(ptr);
+        float32x4_t _v1 = vld1q_f32(ptr + 4);
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        vst1_s8(s8ptr, float2int8(_v0, _v1));
+        ptr += 8;
+        s8ptr += 8;
+    }
+    for (; i + 3 < size; i += 4) // remaining group of 4 floats
+    {
+        float32x4_t _v = vld1q_f32(ptr);
+        _v = vmulq_f32(_v, _scale);
+        int8x8_t v = float2int8(_v, _v); // same vector passed twice; only lanes 0..3 are stored below
+        s8ptr[0] = vget_lane_s8(v, 0);
+        s8ptr[1] = vget_lane_s8(v, 1);
+        s8ptr[2] = vget_lane_s8(v, 2);
+        s8ptr[3] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++) // scalar tail (whole input without NEON); always multiplies by scale_data[0]
+    {
+        float v = *ptr * scale;
+        *s8ptr = float2int8(v);
+        ptr++;
+        s8ptr++;
+    }
+}
+
 int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int elembits = bottom_blob.elembits();
@@ -58,404 +111,59 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
         return forward_bf16s(bottom_blob, top_blob, opt);
 #endif
 
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
-
-#if __ARM_NEON
-    if (elempack == 4)
-    {
-        if (dims == 1)
-        {
-            int w = bottom_blob.w;
-            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
-            int outw = w * elempack / out_elempack;
-
-            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const float scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale);
-                    outptr[1] = float2int8(ptr0[1] * scale);
-                    outptr[2] = float2int8(ptr0[2] * scale);
-                    outptr[3] = float2int8(ptr0[3] * scale);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]);
-                    outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]);
-                    outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]);
-                    outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]);
-                }
-            }
-        }
-
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
-            int outh = h * elempack / out_elempack;
-
-            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    float32x4_t _scale = vdupq_n_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            float32x4_t _vlow = vld1q_f32(ptr0);
-                            float32x4_t _vhigh = vld1q_f32(ptr1);
-                            _vlow = vmulq_f32(_vlow, _scale);
-                            _vhigh = vmulq_f32(_vhigh, _scale);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8);
-                        float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            float32x4_t _vlow = vld1q_f32(ptr0);
-                            float32x4_t _vhigh = vld1q_f32(ptr1);
-                            _vlow = vmulq_f32(_vlow, _scale0);
-                            _vhigh = vmulq_f32(_vhigh, _scale1);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        const float s0 = scale_data[i * 4];
-                        const float s1 = scale_data[i * 4 + 1];
-                        const float s2 = scale_data[i * 4 + 2];
-                        const float s3 = scale_data[i * 4 + 3];
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (dims == 3)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
-            int outc = channels * elempack / out_elempack;
-
-            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    float32x4_t _scale = vdupq_n_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        int i = 0;
-                        for (; i + 1 < size; i += 2)
-                        {
-                            float32x4_t _v0 = vld1q_f32(ptr0);
-                            float32x4_t _v1 = vld1q_f32(ptr0 + 4);
-                            float32x4_t _v2 = vld1q_f32(ptr1);
-                            float32x4_t _v3 = vld1q_f32(ptr1 + 4);
-                            _v0 = vmulq_f32(_v0, _scale);
-                            _v1 = vmulq_f32(_v1, _scale);
-                            _v2 = vmulq_f32(_v2, _scale);
-                            _v3 = vmulq_f32(_v3, _scale);
-                            vst1_s8(outptr, float2int8(_v0, _v2));
-                            vst1_s8(outptr + 8, float2int8(_v1, _v3));
-
-                            ptr0 += 8;
-                            ptr1 += 8;
-                            outptr += 16;
-                        }
-                        for (; i < size; i++)
-                        {
-                            float32x4_t _vlow = vld1q_f32(ptr0);
-                            float32x4_t _vhigh = vld1q_f32(ptr1);
-                            _vlow = vmulq_f32(_vlow, _scale);
-                            _vhigh = vmulq_f32(_vhigh, _scale);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8);
-                        float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4);
-
-                        int i = 0;
-                        for (; i < size; i++)
-                        {
-                            float32x4_t _vlow = vld1q_f32(ptr0);
-                            float32x4_t _vhigh = vld1q_f32(ptr1);
-                            _vlow = vmulq_f32(_vlow, _scale0);
-                            _vhigh = vmulq_f32(_vhigh, _scale1);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        const float s0 = scale_data[q * 4];
-                        const float s1 = scale_data[q * 4 + 1];
-                        const float s2 = scale_data[q * 4 + 2];
-                        const float s3 = scale_data[q * 4 + 3];
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
-
-        return 0;
-    }
-#endif // __ARM_NEON
+    const int dims = bottom_blob.dims; // selects one of the dims == 1 / 2 / 3 branches below
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack; // also passed as the elempack of the output blob
+    const size_t out_elemsize = elempack * 1u; // element size handed to top_blob.create(): elempack times 1
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); // output keeps the input's w and elempack
         if (top_blob.empty())
             return -100;
 
-        const float* ptr = bottom_blob;
-        signed char* outptr = top_blob;
+        const int wp = std::max(1, w / opt.num_threads); // elements per chunk, never less than 1
+        const int nn_w = (w + wp - 1) / wp; // number of chunks (ceiling division)
 
-        if (scale_data_size == 1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_w; ii++)
         {
-            const float scale = scale_data[0];
+            const int i = ii * wp; // index of the first element of this chunk
 
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale);
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale_data[i]);
-            }
+            const float* ptr = (const float*)bottom_blob + i * elempack;
+            signed char* s8ptr = (signed char*)top_blob + i * elempack;
+
+            // assert scale_data_size == 1 (quantize() is called with elempack 1, so only scale_data[0] is applied)
+
+            const int size = std::min(w - i, wp) * elempack; // scalar floats in this chunk; the last chunk may be shorter
+
+            quantize(ptr, s8ptr, scale_data, size, 1); // elempack is already folded into size, hence 1
         }
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-
-        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); // output keeps the input's w, h and elempack
         if (top_blob.empty())
             return -100;
 
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
-            const float* ptr0 = bottom_blob.row(i);
-            signed char* outptr0 = top_blob.row<signed char>(i);
+            const float* ptr = bottom_blob.row(i);
+            signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; // row i's elempack scales, or the single shared scale
 
-            for (int j = 0; j < w; j++)
-            {
-                *outptr0++ = float2int8(*ptr0++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_i, w, elempack); // one row: w elements of elempack floats each
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); // output keeps the input's shape and elempack
         if (top_blob.empty())
             return -100;
 
@@ -463,46 +171,11 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
-            signed char* outptr = top_blob.channel(q);
+            signed char* s8ptr = top_blob.channel(q);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; // channel q's elempack scales, or the single shared scale
 
-            int i = 0;
-#if __ARM_NEON
-            float32x4_t _scale = vdupq_n_f32(scale);
-            for (; i + 15 < size; i += 16)
-            {
-                float32x4_t _v0 = vld1q_f32(ptr);
-                float32x4_t _v1 = vld1q_f32(ptr + 4);
-                float32x4_t _v2 = vld1q_f32(ptr + 8);
-                float32x4_t _v3 = vld1q_f32(ptr + 12);
-                _v0 = vmulq_f32(_v0, _scale);
-                _v1 = vmulq_f32(_v1, _scale);
-                _v2 = vmulq_f32(_v2, _scale);
-                _v3 = vmulq_f32(_v3, _scale);
-                vst1_s8(outptr, float2int8(_v0, _v1));
-                vst1_s8(outptr + 8, float2int8(_v2, _v3));
-
-                ptr += 16;
-                outptr += 16;
-            }
-            for (; i + 7 < size; i += 8)
-            {
-                float32x4_t _v0 = vld1q_f32(ptr);
-                float32x4_t _v1 = vld1q_f32(ptr + 4);
-                _v0 = vmulq_f32(_v0, _scale);
-                _v1 = vmulq_f32(_v1, _scale);
-                int8x8_t _v = float2int8(_v0, _v1);
-                vst1_s8(outptr, _v);
-
-                ptr += 8;
-                outptr += 8;
-            }
-#endif // __ARM_NEON
-            for (; i < size; i++)
-            {
-                *outptr++ = float2int8(*ptr++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_q, w * h, elempack); // one channel: w * h elements of elempack floats each
         }
     }
 
@@ -510,387 +183,115 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
 }
 
 #if NCNN_BF16
-int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void quantize_bf16s(const unsigned short* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
 {
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
+    const int scale_data_size = scale_data.w;
+    const int size = elemcount * elempack;
+
+    // NCNN_LOGE("quantize_bf16s %d   %d %d", scale_data_size, elemcount, elempack);
 
+    float scale = scale_data[0];
 #if __ARM_NEON
-    if (elempack == 4)
+    float32x4_t _scale = vdupq_n_f32(scale);
+    if (scale_data_size > 1)
     {
-        if (dims == 1)
+        if (elempack == 4)
         {
-            int w = bottom_blob.w;
-            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
-            int outw = w * elempack / out_elempack;
-
-            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const float scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const unsigned short* ptr0 = (const unsigned short*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale);
-                    outptr[1] = float2int8(bfloat16_to_float32(ptr0[1]) * scale);
-                    outptr[2] = float2int8(bfloat16_to_float32(ptr0[2]) * scale);
-                    outptr[3] = float2int8(bfloat16_to_float32(ptr0[3]) * scale);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const unsigned short* ptr0 = (const unsigned short*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale_data[i * 4]);
-                    outptr[1] = float2int8(bfloat16_to_float32(ptr0[1]) * scale_data[i * 4 + 1]);
-                    outptr[2] = float2int8(bfloat16_to_float32(ptr0[2]) * scale_data[i * 4 + 2]);
-                    outptr[3] = float2int8(bfloat16_to_float32(ptr0[3]) * scale_data[i * 4 + 3]);
-                }
-            }
-        }
-
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
-            int outh = h * elempack / out_elempack;
-
-            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    float32x4_t _scale = vdupq_n_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.row<const unsigned short>(i * 2);
-                        const unsigned short* ptr1 = bottom_blob.row<const unsigned short>(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            float32x4_t _vlow = bfloat2float(vld1_u16(ptr0));
-                            float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale);
-                            _vhigh = vmulq_f32(_vhigh, _scale);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.row<const unsigned short>(i * 2);
-                        const unsigned short* ptr1 = bottom_blob.row<const unsigned short>(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8);
-                        float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            float32x4_t _vlow = bfloat2float(vld1_u16(ptr0));
-                            float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale0);
-                            _vhigh = vmulq_f32(_vhigh, _scale1);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.row<const unsigned short>(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale);
-                            outptr1[0] = float2int8(bfloat16_to_float32(ptr0[1]) * scale);
-                            outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * scale);
-                            outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.row<const unsigned short>(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        const float s0 = scale_data[i * 4];
-                        const float s1 = scale_data[i * 4 + 1];
-                        const float s2 = scale_data[i * 4 + 2];
-                        const float s3 = scale_data[i * 4 + 3];
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * s0);
-                            outptr1[0] = float2int8(bfloat16_to_float32(ptr0[1]) * s1);
-                            outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * s2);
-                            outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (dims == 3)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
-            int outc = channels * elempack / out_elempack;
-
-            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    float32x4_t _scale = vdupq_n_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.channel(q * 2);
-                        const unsigned short* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            float32x4_t _vlow = bfloat2float(vld1_u16(ptr0));
-                            float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale);
-                            _vhigh = vmulq_f32(_vhigh, _scale);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.channel(q * 2);
-                        const unsigned short* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8);
-                        float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            float32x4_t _vlow = bfloat2float(vld1_u16(ptr0));
-                            float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale0);
-                            _vhigh = vmulq_f32(_vhigh, _scale1);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale);
-                            outptr1[0] = float2int8(bfloat16_to_float32(ptr0[1]) * scale);
-                            outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * scale);
-                            outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const unsigned short* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        const float s0 = scale_data[q * 4];
-                        const float s1 = scale_data[q * 4 + 1];
-                        const float s2 = scale_data[q * 4 + 2];
-                        const float s3 = scale_data[q * 4 + 3];
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * s0);
-                            outptr1[0] = float2int8(bfloat16_to_float32(ptr0[1]) * s1);
-                            outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * s2);
-                            outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
+            _scale = vld1q_f32((const float*)scale_data);
         }
+    }
+#endif // __ARM_NEON
 
-        return 0;
+    int i = 0;
+#if __ARM_NEON
+    for (; i + 7 < size; i += 8)
+    {
+        uint16x8_t _v01 = vld1q_u16(ptr);
+        float32x4_t _v0 = bfloat2float(vget_low_u16(_v01));
+        float32x4_t _v1 = bfloat2float(vget_high_u16(_v01));
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        vst1_s8(s8ptr, float2int8(_v0, _v1));
+        ptr += 8;
+        s8ptr += 8;
+    }
+    for (; i + 3 < size; i += 4)
+    {
+        float32x4_t _v = bfloat2float(vld1_u16(ptr));
+        _v = vmulq_f32(_v, _scale);
+        int8x8_t v = float2int8(_v, _v);
+        s8ptr[0] = vget_lane_s8(v, 0);
+        s8ptr[1] = vget_lane_s8(v, 1);
+        s8ptr[2] = vget_lane_s8(v, 2);
+        s8ptr[3] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr += 4;
     }
 #endif // __ARM_NEON
+    for (; i < size; i++)
+    {
+        float v = bfloat16_to_float32(*ptr) * scale;
+        *s8ptr = float2int8(v);
+        ptr++;
+        s8ptr++;
+    }
+}
+
+int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        const unsigned short* ptr = bottom_blob;
-        signed char* outptr = top_blob;
+        const int wp = std::max(1, w / opt.num_threads);
+        const int nn_w = (w + wp - 1) / wp;
 
-        if (scale_data_size == 1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_w; ii++)
         {
-            const float scale = scale_data[0];
+            const int i = ii * wp;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(bfloat16_to_float32(ptr[i]) * scale);
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(bfloat16_to_float32(ptr[i]) * scale_data[i]);
-            }
+            const unsigned short* ptr = (const unsigned short*)bottom_blob + i * elempack;
+            signed char* s8ptr = (signed char*)top_blob + i * elempack;
+
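+            // scale_data_size is expected to be 1 here: the chunk is flattened (size already includes elempack) and quantized with elempack = 1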
+
+            const int size = std::min(w - i, wp) * elempack;
+
+            quantize_bf16s(ptr, s8ptr, scale_data, size, 1);
         }
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-
-        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
-            const unsigned short* ptr0 = bottom_blob.row<const unsigned short>(i);
-            signed char* outptr0 = top_blob.row<signed char>(i);
+            const unsigned short* ptr = bottom_blob.row<const unsigned short>(i);
+            signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            for (int j = 0; j < w; j++)
-            {
-                *outptr0++ = float2int8(bfloat16_to_float32(*ptr0++) * scale);
-            }
+            quantize_bf16s(ptr, s8ptr, scale_data_i, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -898,14 +299,11 @@ int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opt
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
-            signed char* outptr = top_blob.channel(q);
+            signed char* s8ptr = top_blob.channel(q);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            for (int i = 0; i < size; i++)
-            {
-                *outptr++ = float2int8(bfloat16_to_float32(*ptr++) * scale);
-            }
+            quantize_bf16s(ptr, s8ptr, scale_data_q, w * h, elempack);
         }
     }
 
diff --git a/src/layer/arm/quantize_arm_asimdhp.cpp b/src/layer/arm/quantize_arm_asimdhp.cpp
index 661f06c19cd..ce46fafabc3 100644
--- a/src/layer/arm/quantize_arm_asimdhp.cpp
+++ b/src/layer/arm/quantize_arm_asimdhp.cpp
@@ -23,385 +23,111 @@
 namespace ncnn {
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void quantize_fp16s(const __fp16* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
 {
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
-
-    if (elempack == 4)
-    {
-        if (dims == 1)
-        {
-            int w = bottom_blob.w;
-            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
-            int outw = w * elempack / out_elempack;
-
-            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const float scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8((float)ptr0[0] * scale);
-                    outptr[1] = float2int8((float)ptr0[1] * scale);
-                    outptr[2] = float2int8((float)ptr0[2] * scale);
-                    outptr[3] = float2int8((float)ptr0[3] * scale);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8((float)ptr0[0] * scale_data[i * 4]);
-                    outptr[1] = float2int8((float)ptr0[1] * scale_data[i * 4 + 1]);
-                    outptr[2] = float2int8((float)ptr0[2] * scale_data[i * 4 + 2]);
-                    outptr[3] = float2int8((float)ptr0[3] * scale_data[i * 4 + 3]);
-                }
-            }
-        }
+    const int scale_data_size = scale_data.w;
+    const int size = elemcount * elempack;
 
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
-            int outh = h * elempack / out_elempack;
-
-            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    float32x4_t _scale = vdupq_n_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.row<const __fp16>(i * 2);
-                        const __fp16* ptr1 = bottom_blob.row<const __fp16>(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0));
-                            float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale);
-                            _vhigh = vmulq_f32(_vhigh, _scale);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.row<const __fp16>(i * 2);
-                        const __fp16* ptr1 = bottom_blob.row<const __fp16>(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8);
-                        float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0));
-                            float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale0);
-                            _vhigh = vmulq_f32(_vhigh, _scale1);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8((float)ptr0[0] * scale);
-                            outptr1[0] = float2int8((float)ptr0[1] * scale);
-                            outptr2[0] = float2int8((float)ptr0[2] * scale);
-                            outptr3[0] = float2int8((float)ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        const float s0 = scale_data[i * 4];
-                        const float s1 = scale_data[i * 4 + 1];
-                        const float s2 = scale_data[i * 4 + 2];
-                        const float s3 = scale_data[i * 4 + 3];
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8((float)ptr0[0] * s0);
-                            outptr1[0] = float2int8((float)ptr0[1] * s1);
-                            outptr2[0] = float2int8((float)ptr0[2] * s2);
-                            outptr3[0] = float2int8((float)ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
+    // NCNN_LOGE("quantize_fp16s %d   %d %d", scale_data_size, elemcount, elempack);
 
-        if (dims == 3)
+    float scale = scale_data[0];
+    float32x4_t _scale = vdupq_n_f32(scale);
+    if (scale_data_size > 1)
+    {
+        if (elempack == 4)
         {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
-            int outc = channels * elempack / out_elempack;
-
-            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    float32x4_t _scale = vdupq_n_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.channel(q * 2);
-                        const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0));
-                            float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale);
-                            _vhigh = vmulq_f32(_vhigh, _scale);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.channel(q * 2);
-                        const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8);
-                        float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0));
-                            float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1));
-                            _vlow = vmulq_f32(_vlow, _scale0);
-                            _vhigh = vmulq_f32(_vhigh, _scale1);
-                            int8x8_t _v = float2int8(_vlow, _vhigh);
-                            vst1_s8(outptr, _v);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8((float)ptr0[0] * scale);
-                            outptr1[0] = float2int8((float)ptr0[1] * scale);
-                            outptr2[0] = float2int8((float)ptr0[2] * scale);
-                            outptr3[0] = float2int8((float)ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const __fp16* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        const float s0 = scale_data[q * 4];
-                        const float s1 = scale_data[q * 4 + 1];
-                        const float s2 = scale_data[q * 4 + 2];
-                        const float s3 = scale_data[q * 4 + 3];
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8((float)ptr0[0] * s0);
-                            outptr1[0] = float2int8((float)ptr0[1] * s1);
-                            outptr2[0] = float2int8((float)ptr0[2] * s2);
-                            outptr3[0] = float2int8((float)ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
+            _scale = vld1q_f32((const float*)scale_data);
         }
+    }
 
-        return 0;
+    int i = 0;
+    for (; i + 7 < size; i += 8)
+    {
+        float16x8_t _v01 = vld1q_f16(ptr);
+        float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01));
+        float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01));
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        vst1_s8(s8ptr, float2int8(_v0, _v1));
+        ptr += 8;
+        s8ptr += 8;
+    }
+    for (; i + 3 < size; i += 4)
+    {
+        float32x4_t _v = vcvt_f32_f16(vld1_f16(ptr));
+        _v = vmulq_f32(_v, _scale);
+        int8x8_t v = float2int8(_v, _v);
+        s8ptr[0] = vget_lane_s8(v, 0);
+        s8ptr[1] = vget_lane_s8(v, 1);
+        s8ptr[2] = vget_lane_s8(v, 2);
+        s8ptr[3] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr += 4;
     }
+    for (; i < size; i++)
+    {
+        float v = (float)(*ptr) * scale;
+        *s8ptr = float2int8(v);
+        ptr++;
+        s8ptr++;
+    }
+}
+
+int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        const __fp16* ptr = bottom_blob;
-        signed char* outptr = top_blob;
+        const int wp = std::max(1, w / opt.num_threads);
+        const int nn_w = (w + wp - 1) / wp;
 
-        if (scale_data_size == 1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_w; ii++)
         {
-            const float scale = scale_data[0];
+            const int i = ii * wp;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8((float)ptr[i] * scale);
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8((float)ptr[i] * scale_data[i]);
-            }
+            const __fp16* ptr = (const __fp16*)bottom_blob + i * elempack;
+            signed char* s8ptr = (signed char*)top_blob + i * elempack;
+
+            // scale_data_size is expected to be 1 here: the chunk is flattened and quantize_fp16s(..., 1) only applies scale_data[0]
+
+            const int size = std::min(w - i, wp) * elempack;
+
+            quantize_fp16s(ptr, s8ptr, scale_data, size, 1);
         }
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-
-        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
-            const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-            signed char* outptr0 = top_blob.row<signed char>(i);
+            const __fp16* ptr = bottom_blob.row<const __fp16>(i);
+            signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            for (int j = 0; j < w; j++)
-            {
-                *outptr0++ = float2int8((float)*ptr0++ * scale);
-            }
+            quantize_fp16s(ptr, s8ptr, scale_data_i, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -409,445 +135,127 @@ int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Opt
         for (int q = 0; q < channels; q++)
         {
             const __fp16* ptr = bottom_blob.channel(q);
-            signed char* outptr = top_blob.channel(q);
+            signed char* s8ptr = top_blob.channel(q);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            for (int i = 0; i < size; i++)
-            {
-                *outptr++ = float2int8((float)*ptr++ * scale);
-            }
+            quantize_fp16s(ptr, s8ptr, scale_data_q, w * h, elempack);
         }
     }
 
     return 0;
 }
 
-int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void quantize_fp16sa(const __fp16* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
 {
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
+    const int scale_data_size = scale_data.w;
+    const int size = elemcount * elempack;
 
-    if (elempack == 8)
-    {
-        if (dims == 1)
-        {
-            int w = bottom_blob.w;
-
-            top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-#if defined(_MSC_VER) && !defined(__clang__)
-                float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
-                float16x8_t _scale = vcombine_f16(_scale0, _scale0);
-#else
-                float16x8_t _scale = vdupq_n_f16((__fp16)scale_data[0]);
-#endif
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8;
-                    signed char* outptr = (signed char*)top_blob + i * 8;
-
-                    float16x8_t _v = vld1q_f16(ptr0);
-                    _v = vmulq_f16(_v, _scale);
-                    vst1_s8(outptr, float2int8(_v));
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8;
-                    signed char* outptr = (signed char*)top_blob + i * 8;
-
-                    float16x8_t _v = vld1q_f16(ptr0);
-                    float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4)));
-                    _v = vmulq_f16(_v, _scale);
-                    vst1_s8(outptr, float2int8(_v));
-                }
-            }
-        }
+    // NCNN_LOGE("quantize_fp16sa %d   %d %d", scale_data_size, elemcount, elempack);
 
-        if (dims == 2)
+    __fp16 scale = (__fp16)scale_data[0];
+    float16x4_t _scale0 = vdup_n_f16(scale);
+    float16x4_t _scale1 = _scale0;
+    if (scale_data_size > 1)
+    {
+        if (elempack == 8)
         {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-
-            top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-#if defined(_MSC_VER) && !defined(__clang__)
-                float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
-                float16x8_t _scale = vcombine_f16(_scale0, _scale0);
-#else
-                float16x8_t _scale = vdupq_n_f16((__fp16)scale_data[0]);
-#endif
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-                    signed char* outptr0 = top_blob.row<signed char>(i);
-
-                    for (int j = 0; j < w; j++)
-                    {
-                        float16x8_t _v = vld1q_f16(ptr0);
-                        _v = vmulq_f16(_v, _scale);
-                        vst1_s8(outptr0, float2int8(_v));
-
-                        ptr0 += 8;
-                        outptr0 += 8;
-                    }
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-                    signed char* outptr0 = top_blob.row<signed char>(i);
-
-                    float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4)));
-
-                    for (int j = 0; j < w; j++)
-                    {
-                        float16x8_t _v = vld1q_f16(ptr0);
-                        _v = vmulq_f16(_v, _scale);
-                        vst1_s8(outptr0, float2int8(_v));
-
-                        ptr0 += 8;
-                        outptr0 += 8;
-                    }
-                }
-            }
+            _scale0 = vcvt_f16_f32(vld1q_f32((const float*)scale_data));
+            _scale1 = vcvt_f16_f32(vld1q_f32((const float*)scale_data + 4));
         }
-
-        if (dims == 3)
+        if (elempack == 4)
         {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-
-            top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-#if defined(_MSC_VER) && !defined(__clang__)
-                float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
-                float16x8_t _scale = vcombine_f16(_scale0, _scale0);
-#else
-                float16x8_t _scale = vdupq_n_f16((__fp16)scale_data[0]);
-#endif
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr0 = bottom_blob.channel(q);
-                    signed char* outptr0 = top_blob.channel(q);
-
-                    for (int i = 0; i < size; i++)
-                    {
-                        float16x8_t _v = vld1q_f16(ptr0);
-                        _v = vmulq_f16(_v, _scale);
-                        vst1_s8(outptr0, float2int8(_v));
-
-                        ptr0 += 8;
-                        outptr0 += 8;
-                    }
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr0 = bottom_blob.channel(q);
-                    signed char* outptr0 = top_blob.channel(q);
-
-                    float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + q * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + q * 8 + 4)));
-
-                    for (int i = 0; i < size; i++)
-                    {
-                        float16x8_t _v = vld1q_f16(ptr0);
-                        _v = vmulq_f16(_v, _scale);
-                        vst1_s8(outptr0, float2int8(_v));
-
-                        ptr0 += 8;
-                        outptr0 += 8;
-                    }
-                }
-            }
+            _scale0 = vcvt_f16_f32(vld1q_f32((const float*)scale_data));
+            _scale1 = _scale0;
         }
-
-        return 0;
     }
+    float16x8_t _scale = vcombine_f16(_scale0, _scale1);
 
-    if (elempack == 4)
+    int i = 0;
+    for (; i + 7 < size; i += 8)
     {
-        if (dims == 1)
-        {
-            int w = bottom_blob.w;
-            int outw = w * elempack;
-
-            top_blob.create(outw, (size_t)1u, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const __fp16 scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale);
-                    outptr[1] = float2int8(ptr0[1] * scale);
-                    outptr[2] = float2int8(ptr0[2] * scale);
-                    outptr[3] = float2int8(ptr0[3] * scale);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * (__fp16)scale_data[i * 4]);
-                    outptr[1] = float2int8(ptr0[1] * (__fp16)scale_data[i * 4 + 1]);
-                    outptr[2] = float2int8(ptr0[2] * (__fp16)scale_data[i * 4 + 2]);
-                    outptr[3] = float2int8(ptr0[3] * (__fp16)scale_data[i * 4 + 3]);
-                }
-            }
-        }
-
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int outh = h * elempack;
-
-            top_blob.create(w, outh, (size_t)1u, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const __fp16 scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-                    signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                    signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                    signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                    signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                    for (int j = 0; j < w; j++)
-                    {
-                        outptr0[0] = float2int8(ptr0[0] * scale);
-                        outptr1[0] = float2int8(ptr0[1] * scale);
-                        outptr2[0] = float2int8(ptr0[2] * scale);
-                        outptr3[0] = float2int8(ptr0[3] * scale);
-
-                        ptr0 += 4;
-                        outptr0 += 1;
-                        outptr1 += 1;
-                        outptr2 += 1;
-                        outptr3 += 1;
-                    }
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-                    signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                    signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                    signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                    signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                    const __fp16 s0 = scale_data[i * 4];
-                    const __fp16 s1 = scale_data[i * 4 + 1];
-                    const __fp16 s2 = scale_data[i * 4 + 2];
-                    const __fp16 s3 = scale_data[i * 4 + 3];
-
-                    for (int j = 0; j < w; j++)
-                    {
-                        outptr0[0] = float2int8(ptr0[0] * s0);
-                        outptr1[0] = float2int8(ptr0[1] * s1);
-                        outptr2[0] = float2int8(ptr0[2] * s2);
-                        outptr3[0] = float2int8(ptr0[3] * s3);
-
-                        ptr0 += 4;
-                        outptr0 += 1;
-                        outptr1 += 1;
-                        outptr2 += 1;
-                        outptr3 += 1;
-                    }
-                }
-            }
-        }
-
-        if (dims == 3)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int outc = channels * elempack;
-
-            top_blob.create(w, h, outc, (size_t)1u, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const __fp16 scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr0 = bottom_blob.channel(q);
-                    signed char* outptr0 = top_blob.channel(q * 4);
-                    signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                    signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                    signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                    for (int i = 0; i < size; i++)
-                    {
-                        outptr0[0] = float2int8(ptr0[0] * scale);
-                        outptr1[0] = float2int8(ptr0[1] * scale);
-                        outptr2[0] = float2int8(ptr0[2] * scale);
-                        outptr3[0] = float2int8(ptr0[3] * scale);
-
-                        ptr0 += 4;
-                        outptr0 += 1;
-                        outptr1 += 1;
-                        outptr2 += 1;
-                        outptr3 += 1;
-                    }
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr0 = bottom_blob.channel(q);
-                    signed char* outptr0 = top_blob.channel(q * 4);
-                    signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                    signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                    signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                    const __fp16 s0 = scale_data[q * 4];
-                    const __fp16 s1 = scale_data[q * 4 + 1];
-                    const __fp16 s2 = scale_data[q * 4 + 2];
-                    const __fp16 s3 = scale_data[q * 4 + 3];
-
-                    for (int i = 0; i < size; i++)
-                    {
-                        outptr0[0] = float2int8(ptr0[0] * s0);
-                        outptr1[0] = float2int8(ptr0[1] * s1);
-                        outptr2[0] = float2int8(ptr0[2] * s2);
-                        outptr3[0] = float2int8(ptr0[3] * s3);
-
-                        ptr0 += 4;
-                        outptr0 += 1;
-                        outptr1 += 1;
-                        outptr2 += 1;
-                        outptr3 += 1;
-                    }
-                }
-            }
-        }
-
-        return 0;
+        float16x8_t _v = vld1q_f16(ptr);
+        _v = vmulq_f16(_v, _scale);
+        vst1_s8(s8ptr, float2int8(_v));
+        ptr += 8;
+        s8ptr += 8;
+    }
+    for (; i + 3 < size; i += 4)
+    {
+        float16x4_t _v = vld1_f16(ptr);
+        _v = vmul_f16(_v, _scale0);
+        int8x8_t v = float2int8(vcombine_f16(_v, _v));
+        s8ptr[0] = vget_lane_s8(v, 0);
+        s8ptr[1] = vget_lane_s8(v, 1);
+        s8ptr[2] = vget_lane_s8(v, 2);
+        s8ptr[3] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr += 4;
     }
+    for (; i < size; i++)
+    {
+        __fp16 v = *ptr * scale;
+        *s8ptr = float2int8(v);
+        ptr++;
+        s8ptr++;
+    }
+}
+
+int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        const __fp16* ptr = bottom_blob;
-        signed char* outptr = top_blob;
+        const int wp = std::max(1, w / opt.num_threads);
+        const int nn_w = (w + wp - 1) / wp;
 
-        if (scale_data_size == 1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_w; ii++)
         {
-            const __fp16 scale = scale_data[0];
+            const int i = ii * wp;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale);
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * (__fp16)scale_data[i]);
-            }
+            const __fp16* ptr = (const __fp16*)bottom_blob + i * elempack;
+            signed char* s8ptr = (signed char*)top_blob + i * elempack;
+
+            // scale_data_size is assumed to be 1 (not enforced): elempack is passed as 1 below, so only scale_data[0] is applied
+
+            const int size = std::min(w - i, wp) * elempack;
+
+            quantize_fp16sa(ptr, s8ptr, scale_data, size, 1);
         }
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-
-        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
-            const __fp16* ptr0 = bottom_blob.row<const __fp16>(i);
-            signed char* outptr0 = top_blob.row<signed char>(i);
+            const __fp16* ptr = bottom_blob.row<const __fp16>(i);
+            signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            for (int j = 0; j < w; j++)
-            {
-                *outptr0++ = float2int8(*ptr0++ * scale);
-            }
+            quantize_fp16sa(ptr, s8ptr, scale_data_i, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -855,14 +263,11 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
         for (int q = 0; q < channels; q++)
         {
             const __fp16* ptr = bottom_blob.channel(q);
-            signed char* outptr = top_blob.channel(q);
+            signed char* s8ptr = top_blob.channel(q);
 
-            const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            for (int i = 0; i < size; i++)
-            {
-                *outptr++ = float2int8(*ptr++ * scale);
-            }
+            quantize_fp16sa(ptr, s8ptr, scale_data_q, w * h, elempack);
         }
     }
 
diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp
index a0dd618771d..e46bfef925d 100644
--- a/src/layer/loongarch/quantize_loongarch.cpp
+++ b/src/layer/loongarch/quantize_loongarch.cpp
@@ -29,412 +29,115 @@ Quantize_loongarch::Quantize_loongarch()
 #endif
 }
 
-int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
 {
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
+    const int scale_data_size = scale_data.w;
+    const int size = elemcount * elempack;
+
+    // NCNN_LOGE("quantize %d   %d %d", scale_data_size, elemcount, elempack);
 
+    float scale = scale_data[0];
 #if __loongarch_sx
-    if (elempack == 4)
+    __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale);
+    if (scale_data_size > 1)
     {
-        if (dims == 1)
+        if (elempack == 4)
         {
-            int w = bottom_blob.w;
-            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
-            int outw = w * elempack / out_elempack;
-
-            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const float scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale);
-                    outptr[1] = float2int8(ptr0[1] * scale);
-                    outptr[2] = float2int8(ptr0[2] * scale);
-                    outptr[3] = float2int8(ptr0[3] * scale);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]);
-                    outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]);
-                    outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]);
-                    outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]);
-                }
-            }
-        }
-
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
-            int outh = h * elempack / out_elempack;
-
-            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            __m128 _vlow = (__m128)__lsx_vld(ptr0, 0);
-                            __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0);
-                            _vlow = __lsx_vfmul_s(_vlow, _scale);
-                            _vhigh = __lsx_vfmul_s(_vhigh, _scale);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + i * 8, 0);
-                        __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            __m128 _vlow = (__m128)__lsx_vld(ptr0, 0);
-                            __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0);
-                            _vlow = __lsx_vfmul_s(_vlow, _scale0);
-                            _vhigh = __lsx_vfmul_s(_vhigh, _scale1);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        const float s0 = scale_data[i * 4];
-                        const float s1 = scale_data[i * 4 + 1];
-                        const float s2 = scale_data[i * 4 + 2];
-                        const float s3 = scale_data[i * 4 + 3];
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (dims == 3)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
-            int outc = channels * elempack / out_elempack;
-
-            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        int i = 0;
-                        for (; i + 1 < size; i += 2)
-                        {
-                            __builtin_prefetch(ptr0 + 32);
-                            __builtin_prefetch(ptr1 + 32);
-                            __m128 _v0 = (__m128)__lsx_vld(ptr0, 0);
-                            __m128 _v1 = (__m128)__lsx_vld(ptr0 + 4, 0);
-                            __m128 _v2 = (__m128)__lsx_vld(ptr1, 0);
-                            __m128 _v3 = (__m128)__lsx_vld(ptr1 + 4, 0);
-                            _v0 = __lsx_vfmul_s(_v0, _scale);
-                            _v1 = __lsx_vfmul_s(_v1, _scale);
-                            _v2 = __lsx_vfmul_s(_v2, _scale);
-                            _v3 = __lsx_vfmul_s(_v3, _scale);
-                            *((int64_t*)outptr) = float2int8(_v0, _v2);
-                            *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3);
-
-                            ptr0 += 8;
-                            ptr1 += 8;
-                            outptr += 16;
-                        }
-                        for (; i < size; i++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            __m128 _vlow = (__m128)__lsx_vld(ptr0, 0);
-                            __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0);
-                            _vlow = __lsx_vfmul_s(_vlow, _scale);
-                            _vhigh = __lsx_vfmul_s(_vhigh, _scale);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + q * 8, 0);
-                        __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0);
-
-                        int i = 0;
-                        for (; i < size; i++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            __m128 _vlow = (__m128)__lsx_vld(ptr0, 0);
-                            __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0);
-                            _vlow = __lsx_vfmul_s(_vlow, _scale0);
-                            _vhigh = __lsx_vfmul_s(_vhigh, _scale1);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        const float s0 = scale_data[q * 4];
-                        const float s1 = scale_data[q * 4 + 1];
-                        const float s2 = scale_data[q * 4 + 2];
-                        const float s3 = scale_data[q * 4 + 3];
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
+            _scale = (__m128)__lsx_vld((const float*)scale_data, 0);
         }
+    }
+#endif // __loongarch_sx
 
-        return 0;
+    int i = 0;
+#if __loongarch_sx
+    for (; i + 7 < size; i += 8)
+    {
+        __builtin_prefetch(ptr + 32);
+        __m128 _v0 = (__m128)__lsx_vld(ptr, 0);
+        __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0);
+        _v0 = __lsx_vfmul_s(_v0, _scale);
+        _v1 = __lsx_vfmul_s(_v1, _scale);
+        *((int64_t*)s8ptr) = float2int8(_v0, _v1);
+        ptr += 8;
+        s8ptr += 8;
+    }
+    for (; i + 3 < size; i += 4)
+    {
+        __m128 _v = (__m128)__lsx_vld(ptr, 0);
+        _v = __lsx_vfmul_s(_v, _scale);
+        v16i8 v = (v16i8)float2int8(_v, _v);
+        s8ptr[0] = v[0];
+        s8ptr[1] = v[1];
+        s8ptr[2] = v[2];
+        s8ptr[3] = v[3];
+        ptr += 4;
+        s8ptr += 4;
     }
 #endif // __loongarch_sx
+    for (; i < size; i++)
+    {
+        float v = *ptr * scale;
+        *s8ptr = float2int8(v);
+        ptr++;
+        s8ptr++;
+    }
+}
+
+int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        const float* ptr = bottom_blob;
-        signed char* outptr = top_blob;
+        const int wp = std::max(1, w / opt.num_threads);
+        const int nn_w = (w + wp - 1) / wp;
 
-        if (scale_data_size == 1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_w; ii++)
         {
-            const float scale = scale_data[0];
+            const int i = ii * wp;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale);
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale_data[i]);
-            }
+            const float* ptr = (const float*)bottom_blob + i * elempack;
+            signed char* s8ptr = (signed char*)top_blob + i * elempack;
+
+            // assert scale_data_size == 1
+
+            const int size = std::min(w - i, wp) * elempack;
+
+            quantize(ptr, s8ptr, scale_data, size, 1);
         }
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-
-        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
-            const float* ptr0 = bottom_blob.row(i);
-            signed char* outptr0 = top_blob.row<signed char>(i);
+            const float* ptr = bottom_blob.row(i);
+            signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            for (int j = 0; j < w; j++)
-            {
-                *outptr0++ = float2int8(*ptr0++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_i, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -442,47 +145,11 @@ int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
-            signed char* outptr = top_blob.channel(q);
+            signed char* s8ptr = top_blob.channel(q);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            int i = 0;
-#if __loongarch_sx
-            __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale);
-            for (; i + 15 < size; i += 16)
-            {
-                __builtin_prefetch(ptr + 64);
-                __m128 _v0 = (__m128)__lsx_vld(ptr, 0);
-                __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0);
-                __m128 _v2 = (__m128)__lsx_vld(ptr + 8, 0);
-                __m128 _v3 = (__m128)__lsx_vld(ptr + 12, 0);
-                _v0 = __lsx_vfmul_s(_v0, _scale);
-                _v1 = __lsx_vfmul_s(_v1, _scale);
-                _v2 = __lsx_vfmul_s(_v2, _scale);
-                _v3 = __lsx_vfmul_s(_v3, _scale);
-                *((int64_t*)outptr) = float2int8(_v0, _v1);
-                *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3);
-
-                ptr += 16;
-                outptr += 16;
-            }
-            for (; i + 7 < size; i += 8)
-            {
-                __builtin_prefetch(ptr + 32);
-                __m128 _v0 = (__m128)__lsx_vld(ptr, 0);
-                __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0);
-                _v0 = __lsx_vfmul_s(_v0, _scale);
-                _v1 = __lsx_vfmul_s(_v1, _scale);
-                *((int64_t*)outptr) = float2int8(_v0, _v1);
-
-                ptr += 8;
-                outptr += 8;
-            }
-#endif // __loongarch_sx
-            for (; i < size; i++)
-            {
-                *outptr++ = float2int8(*ptr++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
         }
     }
 
diff --git a/src/layer/mips/quantize_mips.cpp b/src/layer/mips/quantize_mips.cpp
index 963d0908ce4..638e5770cec 100644
--- a/src/layer/mips/quantize_mips.cpp
+++ b/src/layer/mips/quantize_mips.cpp
@@ -29,412 +29,115 @@ Quantize_mips::Quantize_mips()
 #endif
 }
 
-int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+// Multiplies elemcount * elempack floats at ptr by the scale(s) in scale_data and writes float2int8() of each product to s8ptr.
+static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
 {
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
+    const int scale_data_size = scale_data.w; // 1: one shared scale; >1: per-lane scales, only honored below when elempack == 4
+    const int size = elemcount * elempack; // total number of scalar floats to convert
 
+    float scale = scale_data[0]; // shared scale; also the only scale used by the scalar tail loop
 #if __mips_msa
-    if (elempack == 4)
+    v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); // broadcast the shared scale to all four lanes
+    if (scale_data_size > 1)
     {
-        if (dims == 1)
+        if (elempack == 4)
         {
-            int w = bottom_blob.w;
-            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
-            int outw = w * elempack / out_elempack;
-
-            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const float scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale);
-                    outptr[1] = float2int8(ptr0[1] * scale);
-                    outptr[2] = float2int8(ptr0[2] * scale);
-                    outptr[3] = float2int8(ptr0[3] * scale);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]);
-                    outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]);
-                    outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]);
-                    outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]);
-                }
-            }
-        }
-
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
-            int outh = h * elempack / out_elempack;
-
-            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    v4f32 _scale = (v4f32)__msa_fill_w_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0);
-                            v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0);
-                            _vlow = __msa_fmul_w(_vlow, _scale);
-                            _vhigh = __msa_fmul_w(_vhigh, _scale);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        v4f32 _scale0 = (v4f32)__msa_ld_w((const float*)scale_data + i * 8, 0);
-                        v4f32 _scale1 = (v4f32)__msa_ld_w((const float*)scale_data + i * 8 + 4, 0);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0);
-                            v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0);
-                            _vlow = __msa_fmul_w(_vlow, _scale0);
-                            _vhigh = __msa_fmul_w(_vhigh, _scale1);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        const float s0 = scale_data[i * 4];
-                        const float s1 = scale_data[i * 4 + 1];
-                        const float s2 = scale_data[i * 4 + 2];
-                        const float s3 = scale_data[i * 4 + 3];
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (dims == 3)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
-            int outc = channels * elempack / out_elempack;
-
-            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    v4f32 _scale = (v4f32)__msa_fill_w_f32(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        int i = 0;
-                        for (; i + 1 < size; i += 2)
-                        {
-                            __builtin_prefetch(ptr0 + 32);
-                            __builtin_prefetch(ptr1 + 32);
-                            v4f32 _v0 = (v4f32)__msa_ld_w(ptr0, 0);
-                            v4f32 _v1 = (v4f32)__msa_ld_w(ptr0 + 4, 0);
-                            v4f32 _v2 = (v4f32)__msa_ld_w(ptr1, 0);
-                            v4f32 _v3 = (v4f32)__msa_ld_w(ptr1 + 4, 0);
-                            _v0 = __msa_fmul_w(_v0, _scale);
-                            _v1 = __msa_fmul_w(_v1, _scale);
-                            _v2 = __msa_fmul_w(_v2, _scale);
-                            _v3 = __msa_fmul_w(_v3, _scale);
-                            *((int64_t*)outptr) = float2int8(_v0, _v2);
-                            *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3);
-
-                            ptr0 += 8;
-                            ptr1 += 8;
-                            outptr += 16;
-                        }
-                        for (; i < size; i++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0);
-                            v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0);
-                            _vlow = __msa_fmul_w(_vlow, _scale);
-                            _vhigh = __msa_fmul_w(_vhigh, _scale);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        v4f32 _scale0 = (v4f32)__msa_ld_w((const float*)scale_data + q * 8, 0);
-                        v4f32 _scale1 = (v4f32)__msa_ld_w((const float*)scale_data + q * 8 + 4, 0);
-
-                        int i = 0;
-                        for (; i < size; i++)
-                        {
-                            __builtin_prefetch(ptr0 + 16);
-                            __builtin_prefetch(ptr1 + 16);
-                            v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0);
-                            v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0);
-                            _vlow = __msa_fmul_w(_vlow, _scale0);
-                            _vhigh = __msa_fmul_w(_vhigh, _scale1);
-                            *((int64_t*)outptr) = float2int8(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        const float s0 = scale_data[q * 4];
-                        const float s1 = scale_data[q * 4 + 1];
-                        const float s2 = scale_data[q * 4 + 2];
-                        const float s3 = scale_data[q * 4 + 3];
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
+            _scale = (v4f32)__msa_ld_w((const float*)scale_data, 0); // one scale per pack lane, repeated for every group of 4 floats
         }
+    }
+#endif // __mips_msa
 
-        return 0;
+    int i = 0; // floats converted so far; shared by the vector loops and the scalar tail
+#if __mips_msa
+    for (; i + 7 < size; i += 8) // main loop: 8 floats -> 8 int8 per iteration
+    {
+        __builtin_prefetch(ptr + 32);
+        v4f32 _v0 = (v4f32)__msa_ld_w(ptr, 0);
+        v4f32 _v1 = (v4f32)__msa_ld_w(ptr + 4, 0);
+        _v0 = __msa_fmul_w(_v0, _scale);
+        _v1 = __msa_fmul_w(_v1, _scale);
+        const int64_t _packed = float2int8(_v0, _v1);
+        __builtin_memcpy(s8ptr, &_packed, sizeof(_packed)); // callers pass row/chunk starts that need not be 8-byte aligned, so avoid an int64_t* store
+        ptr += 8;
+        s8ptr += 8;
+    }
+    for (; i + 3 < size; i += 4) // at most one leftover group of 4 floats
+    {
+        v4f32 _v = (v4f32)__msa_ld_w(ptr, 0);
+        _v = __msa_fmul_w(_v, _scale);
+        v16i8 v = float2int8(_v, _v); // same input in both halves; only the first four bytes are kept
+        s8ptr[0] = v[0];
+        s8ptr[1] = v[1];
+        s8ptr[2] = v[2];
+        s8ptr[3] = v[3];
+        ptr += 4;
+        s8ptr += 4;
     }
 #endif // __mips_msa
+    for (; i < size; i++) // scalar tail (and the whole range without MSA); always uses scale_data[0]
+    {
+        float v = *ptr * scale;
+        *s8ptr = float2int8(v);
+        ptr++;
+        s8ptr++;
+    }
+}
+
+int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        const float* ptr = bottom_blob;
-        signed char* outptr = top_blob;
+        const int wp = std::max(1, w / opt.num_threads);
+        const int nn_w = (w + wp - 1) / wp;
 
-        if (scale_data_size == 1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_w; ii++)
         {
-            const float scale = scale_data[0];
+            const int i = ii * wp;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale);
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale_data[i]);
-            }
+            const float* ptr = (const float*)bottom_blob + i * elempack;
+            signed char* s8ptr = (signed char*)top_blob + i * elempack;
+
+            // assert scale_data_size == 1
+
+            const int size = std::min(w - i, wp) * elempack;
+
+            quantize(ptr, s8ptr, scale_data, size, 1);
         }
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-
-        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
-            const float* ptr0 = bottom_blob.row(i);
-            signed char* outptr0 = top_blob.row<signed char>(i);
+            const float* ptr = bottom_blob.row(i);
+            signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            for (int j = 0; j < w; j++)
-            {
-                *outptr0++ = float2int8(*ptr0++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_i, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -442,47 +145,11 @@ int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
-            signed char* outptr = top_blob.channel(q);
+            signed char* s8ptr = top_blob.channel(q);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            int i = 0;
-#if __mips_msa
-            v4f32 _scale = (v4f32)__msa_fill_w_f32(scale);
-            for (; i + 15 < size; i += 16)
-            {
-                __builtin_prefetch(ptr + 64);
-                v4f32 _v0 = (v4f32)__msa_ld_w(ptr, 0);
-                v4f32 _v1 = (v4f32)__msa_ld_w(ptr + 4, 0);
-                v4f32 _v2 = (v4f32)__msa_ld_w(ptr + 8, 0);
-                v4f32 _v3 = (v4f32)__msa_ld_w(ptr + 12, 0);
-                _v0 = __msa_fmul_w(_v0, _scale);
-                _v1 = __msa_fmul_w(_v1, _scale);
-                _v2 = __msa_fmul_w(_v2, _scale);
-                _v3 = __msa_fmul_w(_v3, _scale);
-                *((int64_t*)outptr) = float2int8(_v0, _v1);
-                *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3);
-
-                ptr += 16;
-                outptr += 16;
-            }
-            for (; i + 7 < size; i += 8)
-            {
-                __builtin_prefetch(ptr + 32);
-                v4f32 _v0 = (v4f32)__msa_ld_w(ptr, 0);
-                v4f32 _v1 = (v4f32)__msa_ld_w(ptr + 4, 0);
-                _v0 = __msa_fmul_w(_v0, _scale);
-                _v1 = __msa_fmul_w(_v1, _scale);
-                *((int64_t*)outptr) = float2int8(_v0, _v1);
-
-                ptr += 8;
-                outptr += 8;
-            }
-#endif // __mips_msa
-            for (; i < size; i++)
-            {
-                *outptr++ = float2int8(*ptr++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
         }
     }
 
diff --git a/src/layer/x86/quantize_x86.cpp b/src/layer/x86/quantize_x86.cpp
index 8f7ee993673..302cf26ae6f 100644
--- a/src/layer/x86/quantize_x86.cpp
+++ b/src/layer/x86/quantize_x86.cpp
@@ -32,673 +32,165 @@ Quantize_x86::Quantize_x86()
 #endif // __SSE2__
 }
 
-int Quantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
 {
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
+    const int scale_data_size = scale_data.w;
+    const int size = elemcount * elempack;
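+    // scale_data: one scale shared by every value, or elempack per-lane scales (reloaded only for the SIMD pack widths handled below)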
+    // NCNN_LOGE("quantize %d   %d %d", scale_data_size, elemcount, elempack);
 
+    float scale = scale_data[0];
 #if __SSE2__
+    __m128 _scale = _mm_set1_ps(scale);
 #if __AVX__
+    __m256 _scale_avx = _mm256_set1_ps(scale);
 #if __AVX512F__
-    if (elempack == 16)
-    {
-        Mat tmp;
-        convert_packing(bottom_blob, tmp, 8, opt);
-
-        forward(tmp, top_blob, opt);
-
-        return 0;
-    }
+    __m512 _scale_avx512 = _mm512_set1_ps(scale);
 #endif // __AVX512F__
-
-    if (elempack == 8)
+#endif // __AVX__
+    if (scale_data_size > 1)
     {
-        if (dims == 1)
+#if __AVX__
+#if __AVX512F__
+        if (elempack == 16)
         {
-            int w = bottom_blob.w;
-
-            top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                __m256 _scale = _mm256_set1_ps(scale_data[0]);
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr = (const float*)bottom_blob + i * 8;
-                    signed char* outptr = (signed char*)top_blob + i * 8;
-
-                    __m256 _v = _mm256_loadu_ps(ptr);
-                    _v = _mm256_mul_ps(_v, _scale);
-                    *(int64_t*)outptr = float2int8_avx(_v);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr = (const float*)bottom_blob + i * 8;
-                    signed char* outptr = (signed char*)top_blob + i * 8;
-
-                    __m256 _v = _mm256_loadu_ps(ptr);
-                    __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
-                    _v = _mm256_mul_ps(_v, _scale);
-                    *(int64_t*)outptr = float2int8_avx(_v);
-                }
-            }
+            _scale_avx512 = _mm512_loadu_ps((const float*)scale_data);
         }
-
-        if (dims == 2)
+#endif // __AVX512F__
+        if (elempack == 8)
         {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-
-            top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                __m256 _scale = _mm256_set1_ps(scale_data[0]);
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const float* ptr = bottom_blob.row(i);
-                    signed char* outptr = top_blob.row<signed char>(i);
-
-                    int j = 0;
-                    for (; j + 1 < w; j += 2)
-                    {
-                        __m256 _v0 = _mm256_loadu_ps(ptr);
-                        __m256 _v1 = _mm256_loadu_ps(ptr + 8);
-                        _v0 = _mm256_mul_ps(_v0, _scale);
-                        _v1 = _mm256_mul_ps(_v1, _scale);
-                        __m128i _v = float2int8_avx(_v0, _v1);
-                        _mm_storeu_si128((__m128i*)outptr, _v);
-
-                        ptr += 16;
-                        outptr += 16;
-                    }
-                    for (; j < w; j++)
-                    {
-                        __m256 _v = _mm256_loadu_ps(ptr);
-                        _v = _mm256_mul_ps(_v, _scale);
-                        *(int64_t*)outptr = float2int8_avx(_v);
-
-                        ptr += 8;
-                        outptr += 8;
-                    }
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const float* ptr = bottom_blob.row(i);
-                    signed char* outptr = top_blob.row<signed char>(i);
-
-                    __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
-
-                    int j = 0;
-                    for (; j + 1 < w; j += 2)
-                    {
-                        __m256 _v0 = _mm256_loadu_ps(ptr);
-                        __m256 _v1 = _mm256_loadu_ps(ptr + 8);
-                        _v0 = _mm256_mul_ps(_v0, _scale);
-                        _v1 = _mm256_mul_ps(_v1, _scale);
-                        __m128i _v = float2int8_avx(_v0, _v1);
-                        _mm_storeu_si128((__m128i*)outptr, _v);
-
-                        ptr += 16;
-                        outptr += 16;
-                    }
-                    for (; j < w; j++)
-                    {
-                        __m256 _v = _mm256_loadu_ps(ptr);
-                        _v = _mm256_mul_ps(_v, _scale);
-                        *(int64_t*)outptr = float2int8_avx(_v);
-
-                        ptr += 8;
-                        outptr += 8;
-                    }
-                }
-            }
+            _scale_avx = _mm256_loadu_ps((const float*)scale_data);
+#if __AVX512F__
+            _scale_avx512 = combine8x2_ps(_scale_avx, _scale_avx);
+#endif // __AVX512F__
         }
-
-        if (dims == 3)
+#endif // __AVX__
+        if (elempack == 4)
         {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-
-            top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                __m256 _scale = _mm256_set1_ps(scale_data[0]);
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const float* ptr = bottom_blob.channel(q);
-                    signed char* outptr = top_blob.channel(q);
-
-                    int i = 0;
-                    for (; i + 1 < size; i += 2)
-                    {
-                        __m256 _v0 = _mm256_loadu_ps(ptr);
-                        __m256 _v1 = _mm256_loadu_ps(ptr + 8);
-                        _v0 = _mm256_mul_ps(_v0, _scale);
-                        _v1 = _mm256_mul_ps(_v1, _scale);
-                        __m128i _v = float2int8_avx(_v0, _v1);
-                        _mm_storeu_si128((__m128i*)outptr, _v);
-
-                        ptr += 16;
-                        outptr += 16;
-                    }
-                    for (; i < size; i++)
-                    {
-                        __m256 _v = _mm256_loadu_ps(ptr);
-                        _v = _mm256_mul_ps(_v, _scale);
-                        *(int64_t*)outptr = float2int8_avx(_v);
-
-                        ptr += 8;
-                        outptr += 8;
-                    }
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const float* ptr = bottom_blob.channel(q);
-                    signed char* outptr = top_blob.channel(q);
-
-                    __m256 _scale = _mm256_loadu_ps((const float*)scale_data + q * 8);
-
-                    int i = 0;
-                    for (; i + 1 < size; i += 2)
-                    {
-                        __m256 _v0 = _mm256_loadu_ps(ptr);
-                        __m256 _v1 = _mm256_loadu_ps(ptr + 8);
-                        _v0 = _mm256_mul_ps(_v0, _scale);
-                        _v1 = _mm256_mul_ps(_v1, _scale);
-                        __m128i _v = float2int8_avx(_v0, _v1);
-                        _mm_storeu_si128((__m128i*)outptr, _v);
-
-                        ptr += 16;
-                        outptr += 16;
-                    }
-                    for (; i < size; i++)
-                    {
-                        __m256 _v = _mm256_loadu_ps(ptr);
-                        _v = _mm256_mul_ps(_v, _scale);
-                        *(int64_t*)outptr = float2int8_avx(_v);
-
-                        ptr += 8;
-                        outptr += 8;
-                    }
-                }
-            }
+            _scale = _mm_loadu_ps((const float*)scale_data);
+#if __AVX__
+            _scale_avx = combine4x2_ps(_scale, _scale);
+#if __AVX512F__
+            _scale_avx512 = combine8x2_ps(_scale_avx, _scale_avx);
+#endif // __AVX512F__
+#endif // __AVX__
         }
+    }
+#endif // __SSE2__
 
-        return 0;
+    int i = 0;
+#if __SSE2__
+#if __AVX__
+    for (; i + 15 < size; i += 16)
+    {
+#if __AVX512F__
+        __m512 _v = _mm512_loadu_ps(ptr);
+        _v = _mm512_mul_ps(_v, _scale_avx512);
+        _mm_storeu_si128((__m128i*)s8ptr, float2int8_avx512(_v));
+#else  // __AVX512F__
+        __m256 _v0 = _mm256_loadu_ps(ptr);
+        __m256 _v1 = _mm256_loadu_ps(ptr + 8);
+        _v0 = _mm256_mul_ps(_v0, _scale_avx);
+        _v1 = _mm256_mul_ps(_v1, _scale_avx);
+        _mm_storeu_si128((__m128i*)s8ptr, float2int8_avx(_v0, _v1));
+#endif // __AVX512F__
+        ptr += 16;
+        s8ptr += 16;
     }
 #endif // __AVX__
-
-    if (elempack == 4)
+    for (; i + 7 < size; i += 8)
     {
-        if (dims == 1)
-        {
-            int w = bottom_blob.w;
-            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
-            int outw = w * elempack / out_elempack;
-
-            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                const float scale = scale_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale);
-                    outptr[1] = float2int8(ptr0[1] * scale);
-                    outptr[2] = float2int8(ptr0[2] * scale);
-                    outptr[3] = float2int8(ptr0[3] * scale);
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    const float* ptr0 = (const float*)bottom_blob + i * 4;
-                    signed char* outptr = (signed char*)top_blob + i * 4;
-
-                    outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]);
-                    outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]);
-                    outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]);
-                    outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]);
-                }
-            }
-        }
-
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
-            int outh = h * elempack / out_elempack;
-
-            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    __m128 _scale = _mm_set1_ps(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        int j = 0;
-                        for (; j + 1 < w; j += 2)
-                        {
-                            __m128 _v0 = _mm_loadu_ps(ptr0);
-                            __m128 _v1 = _mm_loadu_ps(ptr1);
-                            __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
-                            __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
-                            _v0 = _mm_mul_ps(_v0, _scale);
-                            _v1 = _mm_mul_ps(_v1, _scale);
-                            _v2 = _mm_mul_ps(_v2, _scale);
-                            _v3 = _mm_mul_ps(_v3, _scale);
-                            __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
-                            _mm_storeu_si128((__m128i*)outptr, _v);
-
-                            ptr0 += 8;
-                            ptr1 += 8;
-                            outptr += 16;
-                        }
-                        for (; j < w; j++)
-                        {
-                            __m128 _vlow = _mm_loadu_ps(ptr0);
-                            __m128 _vhigh = _mm_loadu_ps(ptr1);
-                            _vlow = _mm_mul_ps(_vlow, _scale);
-                            _vhigh = _mm_mul_ps(_vhigh, _scale);
-                            *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outh; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i * 2);
-                        const float* ptr1 = bottom_blob.row(i * 2 + 1);
-                        signed char* outptr = top_blob.row<signed char>(i);
-
-                        __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + i * 8);
-                        __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + i * 8 + 4);
-
-                        int j = 0;
-                        for (; j + 1 < w; j += 2)
-                        {
-                            __m128 _v0 = _mm_loadu_ps(ptr0);
-                            __m128 _v1 = _mm_loadu_ps(ptr1);
-                            __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
-                            __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
-                            _v0 = _mm_mul_ps(_v0, _scale0);
-                            _v1 = _mm_mul_ps(_v1, _scale1);
-                            _v2 = _mm_mul_ps(_v2, _scale0);
-                            _v3 = _mm_mul_ps(_v3, _scale1);
-                            __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
-                            _mm_storeu_si128((__m128i*)outptr, _v);
-
-                            ptr0 += 8;
-                            ptr1 += 8;
-                            outptr += 16;
-                        }
-                        for (; j < w; j++)
-                        {
-                            __m128 _vlow = _mm_loadu_ps(ptr0);
-                            __m128 _vhigh = _mm_loadu_ps(ptr1);
-                            _vlow = _mm_mul_ps(_vlow, _scale0);
-                            _vhigh = _mm_mul_ps(_vhigh, _scale1);
-                            *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < h; i++)
-                    {
-                        const float* ptr0 = bottom_blob.row(i);
-                        signed char* outptr0 = top_blob.row<signed char>(i * 4);
-                        signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
-                        signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
-                        signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
-
-                        const float s0 = scale_data[i * 4];
-                        const float s1 = scale_data[i * 4 + 1];
-                        const float s2 = scale_data[i * 4 + 2];
-                        const float s3 = scale_data[i * 4 + 3];
-
-                        for (int j = 0; j < w; j++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (dims == 3)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
-            int outc = channels * elempack / out_elempack;
-
-            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (out_elempack == 8)
-            {
-                if (scale_data_size == 1)
-                {
-                    __m128 _scale = _mm_set1_ps(scale_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        int i = 0;
-                        for (; i + 1 < size; i += 2)
-                        {
-                            __m128 _v0 = _mm_loadu_ps(ptr0);
-                            __m128 _v1 = _mm_loadu_ps(ptr1);
-                            __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
-                            __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
-                            _v0 = _mm_mul_ps(_v0, _scale);
-                            _v1 = _mm_mul_ps(_v1, _scale);
-                            _v2 = _mm_mul_ps(_v2, _scale);
-                            _v3 = _mm_mul_ps(_v3, _scale);
-                            __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
-                            _mm_storeu_si128((__m128i*)outptr, _v);
-
-                            ptr0 += 8;
-                            ptr1 += 8;
-                            outptr += 16;
-                        }
-                        for (; i < size; i++)
-                        {
-                            __m128 _vlow = _mm_loadu_ps(ptr0);
-                            __m128 _vhigh = _mm_loadu_ps(ptr1);
-                            _vlow = _mm_mul_ps(_vlow, _scale);
-                            _vhigh = _mm_mul_ps(_vhigh, _scale);
-                            *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < outc; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q * 2);
-                        const float* ptr1 = bottom_blob.channel(q * 2 + 1);
-                        signed char* outptr = top_blob.channel(q);
-
-                        __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + q * 8);
-                        __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + q * 8 + 4);
-
-                        int i = 0;
-                        for (; i + 1 < size; i += 2)
-                        {
-                            __m128 _v0 = _mm_loadu_ps(ptr0);
-                            __m128 _v1 = _mm_loadu_ps(ptr1);
-                            __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
-                            __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
-                            _v0 = _mm_mul_ps(_v0, _scale0);
-                            _v1 = _mm_mul_ps(_v1, _scale1);
-                            _v2 = _mm_mul_ps(_v2, _scale0);
-                            _v3 = _mm_mul_ps(_v3, _scale1);
-                            __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
-                            _mm_storeu_si128((__m128i*)outptr, _v);
-
-                            ptr0 += 8;
-                            ptr1 += 8;
-                            outptr += 16;
-                        }
-                        for (; i < size; i++)
-                        {
-                            __m128 _vlow = _mm_loadu_ps(ptr0);
-                            __m128 _vhigh = _mm_loadu_ps(ptr1);
-                            _vlow = _mm_mul_ps(_vlow, _scale0);
-                            _vhigh = _mm_mul_ps(_vhigh, _scale1);
-                            *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
-
-                            ptr0 += 4;
-                            ptr1 += 4;
-                            outptr += 8;
-                        }
-                    }
-                }
-            }
-            if (out_elempack == 1)
-            {
-                if (scale_data_size == 1)
-                {
-                    const float scale = scale_data[0];
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * scale);
-                            outptr1[0] = float2int8(ptr0[1] * scale);
-                            outptr2[0] = float2int8(ptr0[2] * scale);
-                            outptr3[0] = float2int8(ptr0[3] * scale);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int q = 0; q < channels; q++)
-                    {
-                        const float* ptr0 = bottom_blob.channel(q);
-                        signed char* outptr0 = top_blob.channel(q * 4);
-                        signed char* outptr1 = top_blob.channel(q * 4 + 1);
-                        signed char* outptr2 = top_blob.channel(q * 4 + 2);
-                        signed char* outptr3 = top_blob.channel(q * 4 + 3);
-
-                        const float s0 = scale_data[q * 4];
-                        const float s1 = scale_data[q * 4 + 1];
-                        const float s2 = scale_data[q * 4 + 2];
-                        const float s3 = scale_data[q * 4 + 3];
-
-                        for (int i = 0; i < size; i++)
-                        {
-                            outptr0[0] = float2int8(ptr0[0] * s0);
-                            outptr1[0] = float2int8(ptr0[1] * s1);
-                            outptr2[0] = float2int8(ptr0[2] * s2);
-                            outptr3[0] = float2int8(ptr0[3] * s3);
-
-                            ptr0 += 4;
-                            outptr0 += 1;
-                            outptr1 += 1;
-                            outptr2 += 1;
-                            outptr3 += 1;
-                        }
-                    }
-                }
-            }
-        }
-
-        return 0;
+#if __AVX__
+        __m256 _v = _mm256_loadu_ps(ptr);
+        _v = _mm256_mul_ps(_v, _scale_avx);
+        *(int64_t*)s8ptr = float2int8_avx(_v);
+#else  // __AVX__
+        __m128 _v0 = _mm_loadu_ps(ptr);
+        __m128 _v1 = _mm_loadu_ps(ptr + 4);
+        _v0 = _mm_mul_ps(_v0, _scale);
+        _v1 = _mm_mul_ps(_v1, _scale);
+        *(int64_t*)s8ptr = float2int8_sse(_v0, _v1);
+#endif // __AVX__
+        ptr += 8;
+        s8ptr += 8;
+    }
+    for (; i + 3 < size; i += 4)
+    {
+        __m128 _v = _mm_loadu_ps(ptr);
+        _v = _mm_mul_ps(_v, _scale);
+        int32_t v = float2int8_sse(_v);
+        s8ptr[0] = (v >> 0) & 0xff;
+        s8ptr[1] = (v >> 8) & 0xff;
+        s8ptr[2] = (v >> 16) & 0xff;
+        s8ptr[3] = (v >> 24) & 0xff;
+        ptr += 4;
+        s8ptr += 4;
     }
 #endif // __SSE2__
+    for (; i < size; i++)
+    {
+        float v = *ptr * scale;
+        *s8ptr = float2int8(v);
+        ptr++;
+        s8ptr++;
+    }
+}
+
+int Quantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        const float* ptr = bottom_blob;
-        signed char* outptr = top_blob;
+        const int wp = std::max(1, w / opt.num_threads);
+        const int nn_w = (w + wp - 1) / wp;
 
-        if (scale_data_size == 1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_w; ii++)
         {
-            const float scale = scale_data[0];
+            const int i = ii * wp;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale);
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < w; i++)
-            {
-                outptr[i] = float2int8(ptr[i] * scale_data[i]);
-            }
+            const float* ptr = (const float*)bottom_blob + i * elempack;
+            signed char* s8ptr = (signed char*)top_blob + i * elempack;
+
+            // 1-D input assumes scale_data_size == 1: elempack 1 is passed below, so quantize() applies only scale_data[0]
+
+            const int size = std::min(w - i, wp) * elempack;
+
+            quantize(ptr, s8ptr, scale_data, size, 1);
         }
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-
-        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < h; i++)
         {
-            const float* ptr0 = bottom_blob.row(i);
-            signed char* outptr0 = top_blob.row<signed char>(i);
+            const float* ptr = bottom_blob.row(i);
+            signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            for (int j = 0; j < w; j++)
-            {
-                *outptr0++ = float2int8(*ptr0++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_i, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
-
-        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -706,14 +198,11 @@ int Quantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
-            signed char* outptr = top_blob.channel(q);
+            signed char* s8ptr = top_blob.channel(q);
 
-            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            for (int i = 0; i < size; i++)
-            {
-                *outptr++ = float2int8(*ptr++ * scale);
-            }
+            quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
         }
     }
 
diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h
index 4a9d2f3739a..0398be80d4a 100644
--- a/src/layer/x86/x86_usability.h
+++ b/src/layer/x86/x86_usability.h
@@ -727,6 +727,7 @@ static NCNN_FORCEINLINE int64_t float2int8_avx(const __m256& _v0)
     __m256i _v0_i = _mm256_cvttps_epi32(_v0_adj);
 #if __AVX512F__
     __m128i _v8 = _mm256_cvtsepi32_epi8(_v0_i);
+    _v8 = _mm_max_epi8(_v8, _mm_set1_epi8(-127));
 #else // __AVX512F__
 #if __AVX2__
     __m256i _v01_s16 = _mm256_packs_epi32(_v0_i, _v0_i);
@@ -1457,7 +1458,9 @@ static NCNN_FORCEINLINE __m128i float2int8_avx512(const __m512& _v0)
     __m512 _v0_p5 = _mm512_or_ps(_p5, _sign);
     __m512 _v0_adj = _mm512_add_ps(_v0, _v0_p5);
     __m512i _v0_i = _mm512_cvttps_epi32(_v0_adj);
-    return _mm512_cvtsepi32_epi8(_v0_i);
+    __m128i _v8 = _mm512_cvtsepi32_epi8(_v0_i);
+    _v8 = _mm_max_epi8(_v8, _mm_set1_epi8(-127));
+    return _v8;
 }
 
 static NCNN_FORCEINLINE __m512 bfloat2float_avx512(const __m256i& v0)