From ff5b554003a0e0c1feec684a66629cc64d62dd46 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 5 Feb 2025 17:27:50 +0800 Subject: [PATCH] restrict one dim quantize scale size, test quantize oom (#5892) * restrict one dim quantize scale size * sse2 requantize pack8 --- src/layer/arm/quantize_arm.cpp | 1336 +++++++++----------- src/layer/arm/quantize_arm_asimdhp.cpp | 1202 +++++++----------- src/layer/loongarch/quantize_loongarch.cpp | 624 ++++----- src/layer/mips/quantize_mips.cpp | 624 ++++----- src/layer/quantize.cpp | 64 +- src/layer/x86/quantize_x86.cpp | 994 ++++++--------- src/layer/x86/requantize_x86.cpp | 75 +- src/layer/x86/x86_usability.h | 5 +- tests/test_quantize_oom.cpp | 80 ++ 9 files changed, 2061 insertions(+), 2943 deletions(-) create mode 100644 tests/test_quantize_oom.cpp diff --git a/src/layer/arm/quantize_arm.cpp b/src/layer/arm/quantize_arm.cpp index 6e395a9bb76..18a07f0a324 100644 --- a/src/layer/arm/quantize_arm.cpp +++ b/src/layer/arm/quantize_arm.cpp @@ -39,6 +39,186 @@ Quantize_arm::Quantize_arm() #endif } +static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack) +{ + const int scale_data_size = scale_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("quantize %d %d %d", scale_data_size, elemcount, elempack); + + float scale = scale_data[0]; +#if __ARM_NEON + float32x4_t _scale = vdupq_n_f32(scale); + if (scale_data_size > 1) + { + if (elempack == 4) + { + _scale = vld1q_f32((const float*)scale_data); + } + } +#endif // __ARM_NEON + + int i = 0; +#if __ARM_NEON + for (; i + 15 < size; i += 16) + { + float32x4_t _v0 = vld1q_f32(ptr); + float32x4_t _v1 = vld1q_f32(ptr + 4); + float32x4_t _v2 = vld1q_f32(ptr + 8); + float32x4_t _v3 = vld1q_f32(ptr + 12); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + _v2 = vmulq_f32(_v2, _scale); + _v3 = vmulq_f32(_v3, _scale); + vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3))); + ptr += 16; + s8ptr += 16; + } + for (; i + 7 < size; i += 8) + { + float32x4_t _v0 = vld1q_f32(ptr); + float32x4_t _v1 = vld1q_f32(ptr + 4); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + vst1_s8(s8ptr, float2int8(_v0, _v1)); + ptr += 8; + s8ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _v = vld1q_f32(ptr); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8(_v, _v); + s8ptr[0] = vget_lane_s8(v, 0); + s8ptr[1] = vget_lane_s8(v, 1); + s8ptr[2] = vget_lane_s8(v, 2); + s8ptr[3] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = *ptr * scale; + *s8ptr = float2int8(v); + ptr++; + s8ptr++; + } +} + +#if __ARM_NEON +static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount); + + float scale = scale_data[0]; + float32x4_t _scale0 = vdupq_n_f32(scale); + float32x4_t _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = vld1q_f32((const float*)scale_data); + _scale1 = vld1q_f32((const float*)scale_data + 4); + } + + int i = 0; + for (; i + 1 < elemcount; i += 2) + { + float32x4_t _v0 = vld1q_f32(ptr0); + float32x4_t _v1 = vld1q_f32(ptr1); + float32x4_t _v2 = vld1q_f32(ptr0 + 4); + float32x4_t _v3 = vld1q_f32(ptr1 + 4); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + _v2 = vmulq_f32(_v2, _scale0); + _v3 = vmulq_f32(_v3, _scale1); + vst1q_s8(s8ptr, 
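+        // interleave: two pack4 input rows, each scaled by its own 4-lane
+        // factor, are narrowed into one pack8 int8 row; float2int8 rounds to
+        // nearest and saturates each lane (ncnn clamps to [-127, 127])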
vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3))); + ptr0 += 8; + ptr1 += 8; + s8ptr += 16; + } + for (; i < elemcount; i++) + { + float32x4_t _v0 = vld1q_f32(ptr0); + float32x4_t _v1 = vld1q_f32(ptr1); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + vst1_s8(s8ptr, float2int8(_v0, _v1)); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} + +static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount); + + float scale = scale_data[0]; + float32x4_t _scale = vdupq_n_f32(scale); + if (scale_data_size > 1) + { + _scale = vld1q_f32((const float*)scale_data); + } + + int i = 0; + for (; i + 7 < elemcount; i += 8) + { + float32x4_t _v0 = vld1q_f32(ptr); + float32x4_t _v1 = vld1q_f32(ptr + 4); + float32x4_t _v2 = vld1q_f32(ptr + 8); + float32x4_t _v3 = vld1q_f32(ptr + 12); + float32x4_t _v4 = vld1q_f32(ptr + 16); + float32x4_t _v5 = vld1q_f32(ptr + 20); + float32x4_t _v6 = vld1q_f32(ptr + 24); + float32x4_t _v7 = vld1q_f32(ptr + 28); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + _v2 = vmulq_f32(_v2, _scale); + _v3 = vmulq_f32(_v3, _scale); + _v4 = vmulq_f32(_v4, _scale); + _v5 = vmulq_f32(_v5, _scale); + _v6 = vmulq_f32(_v6, _scale); + _v7 = vmulq_f32(_v7, _scale); + int8x8_t v0 = float2int8(_v0, _v1); + int8x8_t v1 = float2int8(_v2, _v3); + int8x8_t v2 = float2int8(_v4, _v5); + int8x8_t v3 = float2int8(_v6, _v7); + int8x16_t v01 = vcombine_s8(v0, v1); + int8x16_t v23 = vcombine_s8(v2, v3); + int8x16x2_t v0213 = vuzpq_s8(v01, v23); + int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]); + vst1_s8(s8ptr0, vget_low_s8(v0123.val[0])); + vst1_s8(s8ptr1, vget_high_s8(v0123.val[0])); + vst1_s8(s8ptr2, vget_low_s8(v0123.val[1])); + vst1_s8(s8ptr3, vget_high_s8(v0123.val[1])); + ptr += 32; + s8ptr0 += 8; + s8ptr1 += 8; + s8ptr2 += 8; + s8ptr3 += 8; + } + for (; i < elemcount; i++) + { + float32x4_t _v = vld1q_f32(ptr); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8(_v, _v); + s8ptr0[0] = vget_lane_s8(v, 0); + s8ptr1[0] = vget_lane_s8(v, 1); + s8ptr2[0] = vget_lane_s8(v, 2); + s8ptr3[0] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; + } +} +#endif // __ARM_NEON + int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int elembits = bottom_blob.elembits(); @@ -58,450 +238,169 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o return forward_bf16s(bottom_blob, top_blob, opt); #endif - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; -#if __ARM_NEON - if (elempack == 4) + if (dims == 1) { - if (dims == 1) + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; - int outw = w * elempack / out_elempack; + out_elempack = w * elempack % 8 == 0 ? 
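+            // prefer int8 pack8 output when the flattened length divides by 8;
+            // the 1-D path below assumes the new restriction that a 1-D blob
+            // carries exactly one scale (scale_data_size == 1), which is what
+            // lets each thread quantize its chunk independently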
8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale); - outptr[1] = float2int8(ptr0[1] * scale); - outptr[2] = float2int8(ptr0[2] * scale); - outptr[3] = float2int8(ptr0[3] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); - outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); - outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); - outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); - } - } - } + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (dims == 2) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; - int outh = h * elempack / out_elempack; + const int i = ii * wp; - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const float* ptr = (const float*)bottom_blob + i * elempack; + signed char* s8ptr = (signed char*)top_blob + i * elempack; - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - for (int j = 0; j < w; j++) - { - float32x4_t _vlow = vld1q_f32(ptr0); - float32x4_t _vhigh = vld1q_f32(ptr1); - _vlow = vmulq_f32(_vlow, _scale); - _vhigh = vmulq_f32(_vhigh, _scale); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _vlow = vld1q_f32(ptr0); - float32x4_t _vhigh = vld1q_f32(ptr1); - _vlow = vmulq_f32(_vlow, _scale0); - _vhigh = vmulq_f32(_vhigh, _scale1); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* 
outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - const float s0 = scale_data[i * 4]; - const float s1 = scale_data[i * 4 + 1]; - const float s2 = scale_data[i * 4 + 2]; - const float s3 = scale_data[i * 4 + 3]; - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } + // assert scale_data_size == 1 - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; - int outc = channels * elempack / out_elempack; - - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - int i = 0; - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vld1q_f32(ptr0); - float32x4_t _v1 = vld1q_f32(ptr0 + 4); - float32x4_t _v2 = vld1q_f32(ptr1); - float32x4_t _v3 = vld1q_f32(ptr1 + 4); - _v0 = vmulq_f32(_v0, _scale); - _v1 = vmulq_f32(_v1, _scale); - _v2 = vmulq_f32(_v2, _scale); - _v3 = vmulq_f32(_v3, _scale); - vst1_s8(outptr, float2int8(_v0, _v2)); - vst1_s8(outptr + 8, float2int8(_v1, _v3)); - - ptr0 += 8; - ptr1 += 8; - outptr += 16; - } - for (; i < size; i++) - { - float32x4_t _vlow = vld1q_f32(ptr0); - float32x4_t _vhigh = vld1q_f32(ptr1); - _vlow = vmulq_f32(_vlow, _scale); - _vhigh = vmulq_f32(_vhigh, _scale); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4); - - int i = 0; - for (; i < size; i++) - { - float32x4_t _vlow = vld1q_f32(ptr0); - float32x4_t _vhigh = vld1q_f32(ptr1); - _vlow = vmulq_f32(_vlow, _scale0); - _vhigh = vmulq_f32(_vhigh, _scale1); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if 
(scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - const float s0 = scale_data[q * 4]; - const float s1 = scale_data[q * 4 + 1]; - const float s2 = scale_data[q * 4 + 2]; - const float s3 = scale_data[q * 4 + 3]; - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } + const int size = std::min(w - i, wp) * elempack; - return 0; + quantize(ptr, s8ptr, scale_data, size, 1); + } } -#endif // __ARM_NEON - if (dims == 1) + if (dims == 2) { - int w = bottom_blob.w; + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const float* ptr = bottom_blob; - signed char* outptr = top_blob; - - if (scale_data_size == 1) +#if __ARM_NEON + if (elempack == 4 && out_elempack == 8) { - const float scale = scale_data[0]; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int i = 0; i < outh; i++) { - outptr[i] = float2int8(ptr[i] * scale); + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w); } } - else + if (elempack == 4 && out_elempack == 1) { #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int i = 0; i < h; i++) { - outptr[i] = float2int8(ptr[i] * scale_data[i]); - } - } - } + const float* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const Mat scale_data_i = scale_data_size > 1 ? 
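+                // Mat::range() takes a zero-copy view of the per-row scales;
+                // each pack4 input row i owns the 4 consecutive entries
+                // starting at i * elempack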
scale_data.range(i * elempack, elempack) : scale_data; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); + } + } +#endif // __ARM_NEON + if (elempack == out_elempack) { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; - for (int j = 0; j < w; j++) - { - *outptr0++ = float2int8(*ptr0++ * scale); + quantize(ptr, s8ptr, scale_data_i, w, elempack); } } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __ARM_NEON + if (elempack == 4 && out_elempack == 8) { - const float* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data; - int i = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - for (; i + 15 < size; i += 16) - { - float32x4_t _v0 = vld1q_f32(ptr); - float32x4_t _v1 = vld1q_f32(ptr + 4); - float32x4_t _v2 = vld1q_f32(ptr + 8); - float32x4_t _v3 = vld1q_f32(ptr + 12); - _v0 = vmulq_f32(_v0, _scale); - _v1 = vmulq_f32(_v1, _scale); - _v2 = vmulq_f32(_v2, _scale); - _v3 = vmulq_f32(_v3, _scale); - vst1_s8(outptr, float2int8(_v0, _v1)); - vst1_s8(outptr + 8, float2int8(_v2, _v3)); - - ptr += 16; - outptr += 16; + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h); } - for (; i + 7 < size; i += 8) + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - float32x4_t _v0 = vld1q_f32(ptr); - float32x4_t _v1 = vld1q_f32(ptr + 4); - _v0 = vmulq_f32(_v0, _scale); - _v1 = vmulq_f32(_v1, _scale); - int8x8_t _v = float2int8(_v0, _v1); - vst1_s8(outptr, _v); - - ptr += 8; - outptr += 8; + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_data_q = scale_data_size > 1 ? 
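+                // per-channel scales: pack4 channel q owns scales [q*4, q*4 + 4);
+                // scalar sketch of what quantize_pack4to1 computes:
+                //   s8ptrK[j] = float2int8(ptr[j * 4 + K] * scale[K])   for K = 0..3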
scale_data.range(q * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); } + } #endif // __ARM_NEON - for (; i < size; i++) + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - *outptr++ = float2int8(*ptr++ * scale); + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize(ptr, s8ptr, scale_data_q, w * h, elempack); } } } @@ -510,401 +409,360 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } #if NCNN_BF16 -int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void quantize_bf16s(const unsigned short* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int size = elemcount * elempack; + // NCNN_LOGE("quantize_bf16s %d %d %d", scale_data_size, elemcount, elempack); + + float scale = scale_data[0]; #if __ARM_NEON - if (elempack == 4) + float32x4_t _scale = vdupq_n_f32(scale); + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 4) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; - int outw = w * elempack / out_elempack; + _scale = vld1q_f32((const float*)scale_data); + } + } +#endif // __ARM_NEON - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + int i = 0; +#if __ARM_NEON + for (; i + 15 < size; i += 16) + { + uint16x8_t _v01 = vld1q_u16(ptr); + uint16x8_t _v23 = vld1q_u16(ptr + 8); + float32x4_t _v0 = bfloat2float(vget_low_u16(_v01)); + float32x4_t _v1 = bfloat2float(vget_high_u16(_v01)); + float32x4_t _v2 = bfloat2float(vget_low_u16(_v23)); + float32x4_t _v3 = bfloat2float(vget_high_u16(_v23)); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + _v2 = vmulq_f32(_v2, _scale); + _v3 = vmulq_f32(_v3, _scale); + vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3))); + ptr += 16; + s8ptr += 16; + } + for (; i + 7 < size; i += 8) + { + uint16x8_t _v01 = vld1q_u16(ptr); + float32x4_t _v0 = bfloat2float(vget_low_u16(_v01)); + float32x4_t _v1 = bfloat2float(vget_high_u16(_v01)); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + vst1_s8(s8ptr, float2int8(_v0, _v1)); + ptr += 8; + s8ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _v = bfloat2float(vld1_u16(ptr)); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8(_v, _v); + s8ptr[0] = vget_lane_s8(v, 0); + s8ptr[1] = vget_lane_s8(v, 1); + s8ptr[2] = vget_lane_s8(v, 2); + s8ptr[3] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(*ptr) * scale; + *s8ptr = float2int8(v); + ptr++; + s8ptr++; + } +} - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const unsigned short* ptr0 = (const unsigned short*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale); - outptr[1] = float2int8(bfloat16_to_float32(ptr0[1]) * scale); - outptr[2] = 
float2int8(bfloat16_to_float32(ptr0[2]) * scale); - outptr[3] = float2int8(bfloat16_to_float32(ptr0[3]) * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const unsigned short* ptr0 = (const unsigned short*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale_data[i * 4]); - outptr[1] = float2int8(bfloat16_to_float32(ptr0[1]) * scale_data[i * 4 + 1]); - outptr[2] = float2int8(bfloat16_to_float32(ptr0[2]) * scale_data[i * 4 + 2]); - outptr[3] = float2int8(bfloat16_to_float32(ptr0[3]) * scale_data[i * 4 + 3]); - } - } - } +#if __ARM_NEON +static void quantize_pack4to8_bf16s(const unsigned short* ptr0, const unsigned short* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; - int outh = h * elempack / out_elempack; + // NCNN_LOGE("quantize_pack4to8_bf16s %d %d", scale_data_size, elemcount); - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_data[0]; + float32x4_t _scale0 = vdupq_n_f32(scale); + float32x4_t _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = vld1q_f32((const float*)scale_data); + _scale1 = vld1q_f32((const float*)scale_data + 4); + } - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const unsigned short* ptr0 = bottom_blob.row(i * 2); - const unsigned short* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - for (int j = 0; j < w; j++) - { - float32x4_t _vlow = bfloat2float(vld1_u16(ptr0)); - float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale); - _vhigh = vmulq_f32(_vhigh, _scale); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const unsigned short* ptr0 = bottom_blob.row(i * 2); - const unsigned short* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _vlow = bfloat2float(vld1_u16(ptr0)); - float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale0); - _vhigh = vmulq_f32(_vhigh, _scale1); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const unsigned short* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale); - outptr1[0] = 
float2int8(bfloat16_to_float32(ptr0[1]) * scale); - outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * scale); - outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const unsigned short* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - const float s0 = scale_data[i * 4]; - const float s1 = scale_data[i * 4 + 1]; - const float s2 = scale_data[i * 4 + 2]; - const float s3 = scale_data[i * 4 + 3]; - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * s0); - outptr1[0] = float2int8(bfloat16_to_float32(ptr0[1]) * s1); - outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * s2); - outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } + int i = 0; + for (; i + 1 < elemcount; i += 2) + { + uint16x8_t _v02 = vld1q_u16(ptr0); + uint16x8_t _v13 = vld1q_u16(ptr1); + float32x4_t _v0 = bfloat2float(vget_low_u16(_v02)); + float32x4_t _v1 = bfloat2float(vget_low_u16(_v13)); + float32x4_t _v2 = bfloat2float(vget_high_u16(_v02)); + float32x4_t _v3 = bfloat2float(vget_high_u16(_v13)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + _v2 = vmulq_f32(_v2, _scale0); + _v3 = vmulq_f32(_v3, _scale1); + vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3))); + ptr0 += 8; + ptr1 += 8; + s8ptr += 16; + } + for (; i < elemcount; i++) + { + float32x4_t _v0 = bfloat2float(vld1_u16(ptr0)); + float32x4_t _v1 = bfloat2float(vld1_u16(ptr1)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + vst1_s8(s8ptr, float2int8(_v0, _v1)); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
8 : 1; - int outc = channels * elempack / out_elempack; - - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const unsigned short* ptr0 = bottom_blob.channel(q * 2); - const unsigned short* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - for (int i = 0; i < size; i++) - { - float32x4_t _vlow = bfloat2float(vld1_u16(ptr0)); - float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale); - _vhigh = vmulq_f32(_vhigh, _scale); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const unsigned short* ptr0 = bottom_blob.channel(q * 2); - const unsigned short* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _vlow = bfloat2float(vld1_u16(ptr0)); - float32x4_t _vhigh = bfloat2float(vld1_u16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale0); - _vhigh = vmulq_f32(_vhigh, _scale1); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const unsigned short* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * scale); - outptr1[0] = float2int8(bfloat16_to_float32(ptr0[1]) * scale); - outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * scale); - outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const unsigned short* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - const float s0 = scale_data[q * 4]; - const float s1 = scale_data[q * 4 + 1]; - const float s2 = scale_data[q * 4 + 2]; - const float s3 = scale_data[q * 4 + 3]; - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(bfloat16_to_float32(ptr0[0]) * s0); - outptr1[0] = float2int8(bfloat16_to_float32(ptr0[1]) * s1); - outptr2[0] = float2int8(bfloat16_to_float32(ptr0[2]) * s2); - outptr3[0] = float2int8(bfloat16_to_float32(ptr0[3]) * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } +static void quantize_pack4to1_bf16s(const unsigned short* ptr, signed char* s8ptr0, signed char* s8ptr1, signed 
char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to1_bf16s %d %d", scale_data_size, elemcount); - return 0; + float scale = scale_data[0]; + float32x4_t _scale = vdupq_n_f32(scale); + if (scale_data_size > 1) + { + _scale = vld1q_f32((const float*)scale_data); + } + + int i = 0; + for (; i + 7 < elemcount; i += 8) + { + uint16x8_t _v01 = vld1q_u16(ptr); + uint16x8_t _v23 = vld1q_u16(ptr + 8); + uint16x8_t _v45 = vld1q_u16(ptr + 16); + uint16x8_t _v67 = vld1q_u16(ptr + 24); + float32x4_t _v0 = bfloat2float(vget_low_u16(_v01)); + float32x4_t _v1 = bfloat2float(vget_high_u16(_v01)); + float32x4_t _v2 = bfloat2float(vget_low_u16(_v23)); + float32x4_t _v3 = bfloat2float(vget_high_u16(_v23)); + float32x4_t _v4 = bfloat2float(vget_low_u16(_v45)); + float32x4_t _v5 = bfloat2float(vget_high_u16(_v45)); + float32x4_t _v6 = bfloat2float(vget_low_u16(_v67)); + float32x4_t _v7 = bfloat2float(vget_high_u16(_v67)); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + _v2 = vmulq_f32(_v2, _scale); + _v3 = vmulq_f32(_v3, _scale); + _v4 = vmulq_f32(_v4, _scale); + _v5 = vmulq_f32(_v5, _scale); + _v6 = vmulq_f32(_v6, _scale); + _v7 = vmulq_f32(_v7, _scale); + int8x8_t v0 = float2int8(_v0, _v1); + int8x8_t v1 = float2int8(_v2, _v3); + int8x8_t v2 = float2int8(_v4, _v5); + int8x8_t v3 = float2int8(_v6, _v7); + int8x16_t v01 = vcombine_s8(v0, v1); + int8x16_t v23 = vcombine_s8(v2, v3); + int8x16x2_t v0213 = vuzpq_s8(v01, v23); + int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]); + vst1_s8(s8ptr0, vget_low_s8(v0123.val[0])); + vst1_s8(s8ptr1, vget_high_s8(v0123.val[0])); + vst1_s8(s8ptr2, vget_low_s8(v0123.val[1])); + vst1_s8(s8ptr3, vget_high_s8(v0123.val[1])); + ptr += 32; + s8ptr0 += 8; + s8ptr1 += 8; + s8ptr2 += 8; + s8ptr3 += 8; + } + for (; i < elemcount; i++) + { + float32x4_t _v = bfloat2float(vld1_u16(ptr)); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8(_v, _v); + s8ptr0[0] = vget_lane_s8(v, 0); + s8ptr1[0] = vget_lane_s8(v, 1); + s8ptr2[0] = vget_lane_s8(v, 2); + s8ptr3[0] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; } +} #endif // __ARM_NEON +int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + if (dims == 1) { - int w = bottom_blob.w; + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 
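+            // bf16 flavour of the same 1-D strategy: identical chunking to the
+            // fp32 path, but quantize_bf16s widens each u16 with bfloat2float
+            // before applying the single scale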
8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const unsigned short* ptr = bottom_blob; - signed char* outptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(bfloat16_to_float32(ptr[i]) * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(bfloat16_to_float32(ptr[i]) * scale_data[i]); - } + const unsigned short* ptr = (const unsigned short*)bottom_blob + i * elempack; + signed char* s8ptr = (signed char*)top_blob + i * elempack; + + // assert scale_data_size == 1 + + const int size = std::min(w - i, wp) * elempack; + + quantize_bf16s(ptr, s8ptr, scale_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __ARM_NEON + if (elempack == 4 && out_elempack == 8) { - const unsigned short* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const unsigned short* ptr0 = bottom_blob.row(i * 2); + const unsigned short* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data; - for (int j = 0; j < w; j++) + quantize_pack4to8_bf16s(ptr0, ptr1, s8ptr, scale_data_i, w); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - *outptr0++ = float2int8(bfloat16_to_float32(*ptr0++) * scale); + const unsigned short* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_pack4to1_bf16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); + } + } +#endif // __ARM_NEON + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const unsigned short* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? 
scale_data.range(i * elempack, elempack) : scale_data; + + quantize_bf16s(ptr, s8ptr, scale_data_i, w, elempack); } } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __ARM_NEON + if (elempack == 4 && out_elempack == 8) { - const unsigned short* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const unsigned short* ptr0 = bottom_blob.channel(q * 2); + const unsigned short* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data; - for (int i = 0; i < size; i++) + quantize_pack4to8_bf16s(ptr0, ptr1, s8ptr, scale_data_q, w * h); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize_pack4to1_bf16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); + } + } +#endif // __ARM_NEON + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - *outptr++ = float2int8(bfloat16_to_float32(*ptr++) * scale); + const unsigned short* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? 
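+                // passthrough (elempack == out_elempack): the generic
+                // quantize_bf16s covers both pack1 (broadcast scale) and
+                // pack4 (per-lane scale vector) layouts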
scale_data.range(q * elempack, elempack) : scale_data; + + quantize_bf16s(ptr, s8ptr, scale_data_q, w * h, elempack); } } } diff --git a/src/layer/arm/quantize_arm_asimdhp.cpp b/src/layer/arm/quantize_arm_asimdhp.cpp index 661f06c19cd..3c3bcb836ec 100644 --- a/src/layer/arm/quantize_arm_asimdhp.cpp +++ b/src/layer/arm/quantize_arm_asimdhp.cpp @@ -23,399 +23,344 @@ namespace ncnn { #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void quantize_fp16s(const __fp16* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("quantize_fp16s %d %d %d", scale_data_size, elemcount, elempack); - if (elempack == 4) + float scale = scale_data[0]; + float32x4_t _scale = vdupq_n_f32(scale); + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 4) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; - int outw = w * elempack / out_elempack; + _scale = vld1q_f32((const float*)scale_data); + } + } - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + int i = 0; + for (; i + 15 < size; i += 16) + { + float16x8_t _v01 = vld1q_f16(ptr); + float16x8_t _v23 = vld1q_f16(ptr + 8); + float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01)); + float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01)); + float32x4_t _v2 = vcvt_f32_f16(vget_low_f16(_v23)); + float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v23)); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + _v2 = vmulq_f32(_v2, _scale); + _v3 = vmulq_f32(_v3, _scale); + vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3))); + ptr += 16; + s8ptr += 16; + } + for (; i + 7 < size; i += 8) + { + float16x8_t _v01 = vld1q_f16(ptr); + float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01)); + float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01)); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + vst1_s8(s8ptr, float2int8(_v0, _v1)); + ptr += 8; + s8ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _v = vcvt_f32_f16(vld1_f16(ptr)); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8(_v, _v); + s8ptr[0] = vget_lane_s8(v, 0); + s8ptr[1] = vget_lane_s8(v, 1); + s8ptr[2] = vget_lane_s8(v, 2); + s8ptr[3] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr += 4; + } + for (; i < size; i++) + { + float v = (float)(*ptr) * scale; + *s8ptr = float2int8(v); + ptr++; + s8ptr++; + } +} - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8((float)ptr0[0] * scale); - outptr[1] = float2int8((float)ptr0[1] * scale); - outptr[2] = float2int8((float)ptr0[2] * scale); - outptr[3] = float2int8((float)ptr0[3] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8((float)ptr0[0] * scale_data[i * 4]); - outptr[1] = float2int8((float)ptr0[1] * scale_data[i * 4 + 1]); - outptr[2] = 
float2int8((float)ptr0[2] * scale_data[i * 4 + 2]); - outptr[3] = float2int8((float)ptr0[3] * scale_data[i * 4 + 3]); - } - } - } +static void quantize_pack4to8_fp16s(const __fp16* ptr0, const __fp16* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; - int outh = h * elempack / out_elempack; + // NCNN_LOGE("quantize_pack4to8_fp16s %d %d", scale_data_size, elemcount); - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_data[0]; + float32x4_t _scale0 = vdupq_n_f32(scale); + float32x4_t _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = vld1q_f32((const float*)scale_data); + _scale1 = vld1q_f32((const float*)scale_data + 4); + } - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const __fp16* ptr0 = bottom_blob.row(i * 2); - const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - for (int j = 0; j < w; j++) - { - float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0)); - float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale); - _vhigh = vmulq_f32(_vhigh, _scale); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const __fp16* ptr0 = bottom_blob.row(i * 2); - const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0)); - float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale0); - _vhigh = vmulq_f32(_vhigh, _scale1); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8((float)ptr0[0] * scale); - outptr1[0] = float2int8((float)ptr0[1] * scale); - outptr2[0] = float2int8((float)ptr0[2] * scale); - outptr3[0] = float2int8((float)ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - const float s0 = scale_data[i * 4]; - const float s1 = scale_data[i * 4 + 1]; - const float 
s2 = scale_data[i * 4 + 2]; - const float s3 = scale_data[i * 4 + 3]; - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8((float)ptr0[0] * s0); - outptr1[0] = float2int8((float)ptr0[1] * s1); - outptr2[0] = float2int8((float)ptr0[2] * s2); - outptr3[0] = float2int8((float)ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } + int i = 0; + for (; i + 1 < elemcount; i += 2) + { + float16x8_t _v02 = vld1q_f16(ptr0); + float16x8_t _v13 = vld1q_f16(ptr1); + float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v02)); + float32x4_t _v1 = vcvt_f32_f16(vget_low_f16(_v13)); + float32x4_t _v2 = vcvt_f32_f16(vget_high_f16(_v02)); + float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v13)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + _v2 = vmulq_f32(_v2, _scale0); + _v3 = vmulq_f32(_v3, _scale1); + vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3))); + ptr0 += 8; + ptr1 += 8; + s8ptr += 16; + } + for (; i < elemcount; i++) + { + float32x4_t _v0 = vcvt_f32_f16(vld1_f16(ptr0)); + float32x4_t _v1 = vcvt_f32_f16(vld1_f16(ptr1)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + vst1_s8(s8ptr, float2int8(_v0, _v1)); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; - int outc = channels * elempack / out_elempack; - - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q * 2); - const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - for (int i = 0; i < size; i++) - { - float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0)); - float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale); - _vhigh = vmulq_f32(_vhigh, _scale); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q * 2); - const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _vlow = vcvt_f32_f16(vld1_f16(ptr0)); - float32x4_t _vhigh = vcvt_f32_f16(vld1_f16(ptr1)); - _vlow = vmulq_f32(_vlow, _scale0); - _vhigh = vmulq_f32(_vhigh, _scale1); - int8x8_t _v = float2int8(_vlow, _vhigh); - vst1_s8(outptr, _v); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = 
top_blob.channel(q * 4 + 3); - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8((float)ptr0[0] * scale); - outptr1[0] = float2int8((float)ptr0[1] * scale); - outptr2[0] = float2int8((float)ptr0[2] * scale); - outptr3[0] = float2int8((float)ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - const float s0 = scale_data[q * 4]; - const float s1 = scale_data[q * 4 + 1]; - const float s2 = scale_data[q * 4 + 2]; - const float s3 = scale_data[q * 4 + 3]; - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8((float)ptr0[0] * s0); - outptr1[0] = float2int8((float)ptr0[1] * s1); - outptr2[0] = float2int8((float)ptr0[2] * s2); - outptr3[0] = float2int8((float)ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } +static void quantize_pack4to1_fp16s(const __fp16* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to1_fp16s %d %d", scale_data_size, elemcount); + + float scale = scale_data[0]; + float32x4_t _scale = vdupq_n_f32(scale); + if (scale_data_size > 1) + { + _scale = vld1q_f32((const float*)scale_data); + } - return 0; + int i = 0; + for (; i + 7 < elemcount; i += 8) + { + float16x8_t _v01 = vld1q_f16(ptr); + float16x8_t _v23 = vld1q_f16(ptr + 8); + float16x8_t _v45 = vld1q_f16(ptr + 16); + float16x8_t _v67 = vld1q_f16(ptr + 24); + float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01)); + float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01)); + float32x4_t _v2 = vcvt_f32_f16(vget_low_f16(_v23)); + float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v23)); + float32x4_t _v4 = vcvt_f32_f16(vget_low_f16(_v45)); + float32x4_t _v5 = vcvt_f32_f16(vget_high_f16(_v45)); + float32x4_t _v6 = vcvt_f32_f16(vget_low_f16(_v67)); + float32x4_t _v7 = vcvt_f32_f16(vget_high_f16(_v67)); + _v0 = vmulq_f32(_v0, _scale); + _v1 = vmulq_f32(_v1, _scale); + _v2 = vmulq_f32(_v2, _scale); + _v3 = vmulq_f32(_v3, _scale); + _v4 = vmulq_f32(_v4, _scale); + _v5 = vmulq_f32(_v5, _scale); + _v6 = vmulq_f32(_v6, _scale); + _v7 = vmulq_f32(_v7, _scale); + int8x8_t v0 = float2int8(_v0, _v1); + int8x8_t v1 = float2int8(_v2, _v3); + int8x8_t v2 = float2int8(_v4, _v5); + int8x8_t v3 = float2int8(_v6, _v7); + int8x16_t v01 = vcombine_s8(v0, v1); + int8x16_t v23 = vcombine_s8(v2, v3); + int8x16x2_t v0213 = vuzpq_s8(v01, v23); + int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]); + vst1_s8(s8ptr0, vget_low_s8(v0123.val[0])); + vst1_s8(s8ptr1, vget_high_s8(v0123.val[0])); + vst1_s8(s8ptr2, vget_low_s8(v0123.val[1])); + vst1_s8(s8ptr3, vget_high_s8(v0123.val[1])); + ptr += 32; + s8ptr0 += 8; + s8ptr1 += 8; + s8ptr2 += 8; + s8ptr3 += 8; + } + for (; i < elemcount; i++) + { + float32x4_t _v = vcvt_f32_f16(vld1_f16(ptr)); + _v = vmulq_f32(_v, _scale); + int8x8_t v = float2int8(_v, _v); + s8ptr0[0] = vget_lane_s8(v, 0); + s8ptr1[0] = vget_lane_s8(v, 1); + s8ptr2[0] = vget_lane_s8(v, 2); + s8ptr3[0] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 
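+        // the vector loop above deinterleaves 8 pack4 elements into four planar
+        // rows with two vuzpq_s8 passes; this scalar tail moves one pack4
+        // element at a time, one int8 lane per output row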
1; } +} + +int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; if (dims == 1) { - int w = bottom_blob.w; + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const __fp16* ptr = bottom_blob; - signed char* outptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8((float)ptr[i] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8((float)ptr[i] * scale_data[i]); - } + const __fp16* ptr = (const __fp16*)bottom_blob + i * elempack; + signed char* s8ptr = (signed char*)top_blob + i * elempack; + + // assert scale_data_size == 1 + + const int size = std::min(w - i, wp) * elempack; + + quantize_fp16s(ptr, s8ptr, scale_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8_fp16s(ptr0, ptr1, s8ptr, scale_data_i, w); + } + } + if (elempack == 4 && out_elempack == 1) { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? 
scale_data.range(i * elempack, elempack) : scale_data; - for (int j = 0; j < w; j++) + quantize_pack4to1_fp16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); + } + } + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - *outptr0++ = float2int8((float)*ptr0++ * scale); + const __fp16* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_fp16s(ptr, s8ptr, scale_data_i, w, elempack); } } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8_fp16s(ptr0, ptr1, s8ptr, scale_data_q, w * h); + } + } + if (elempack == 4 && out_elempack == 1) { - const __fp16* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; - for (int i = 0; i < size; i++) + quantize_pack4to1_fp16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); + } + } + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - *outptr++ = float2int8((float)*ptr++ * scale); + const __fp16* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? 
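// ============================================================================
// NOTE: when scale_data carries per-channel values, each job hands its kernel
// a view of the matching slice via Mat::range(x, n), which aliases n elements
// starting at element x (no copy).  Sketch of the index mapping:
//
//     // row i, pack4to1: scale_data.range(4 * i, 4)
//     //   -> scales { scale_data[4*i + 0] .. scale_data[4*i + 3] }
//     // row i, pack4to8: scale_data.range(8 * i, 8)
//     //   -> scales { scale_data[8*i + 0] .. scale_data[8*i + 7] }
//
// With scale_data_size == 1 the kernels broadcast scale_data[0] instead.
// ============================================================================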
scale_data.range(q * elempack, elempack) : scale_data; + + quantize_fp16s(ptr, s8ptr, scale_data_q, w * h, elempack); } } } @@ -423,445 +368,248 @@ int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Opt return 0; } -int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void quantize_fp16sa(const __fp16* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("quantize_fp16sa %d %d %d", scale_data_size, elemcount, elempack); - if (elempack == 8) + __fp16 scale = (__fp16)scale_data[0]; + float16x4_t _scale0 = vdup_n_f16(scale); + float16x4_t _scale1 = _scale0; + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 8) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { -#if defined(_MSC_VER) && !defined(__clang__) - float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0])); - float16x8_t _scale = vcombine_f16(_scale0, _scale0); -#else - float16x8_t _scale = vdupq_n_f16((__fp16)scale_data[0]); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; - signed char* outptr = (signed char*)top_blob + i * 8; - - float16x8_t _v = vld1q_f16(ptr0); - _v = vmulq_f16(_v, _scale); - vst1_s8(outptr, float2int8(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; - signed char* outptr = (signed char*)top_blob + i * 8; - - float16x8_t _v = vld1q_f16(ptr0); - float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4))); - _v = vmulq_f16(_v, _scale); - vst1_s8(outptr, float2int8(_v)); - } - } + _scale0 = vcvt_f16_f32(vld1q_f32((const float*)scale_data)); + _scale1 = vcvt_f16_f32(vld1q_f32((const float*)scale_data + 4)); } - - if (dims == 2) + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { -#if defined(_MSC_VER) && !defined(__clang__) - float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0])); - float16x8_t _scale = vcombine_f16(_scale0, _scale0); -#else - float16x8_t _scale = vdupq_n_f16((__fp16)scale_data[0]); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); - - for (int j = 0; j < w; j++) - { - float16x8_t _v = vld1q_f16(ptr0); - _v = vmulq_f16(_v, _scale); - vst1_s8(outptr0, float2int8(_v)); - - ptr0 += 8; - outptr0 += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); - - float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4))); - - for (int j = 0; j < w; j++) - { - float16x8_t _v = vld1q_f16(ptr0); - _v = vmulq_f16(_v, _scale); - vst1_s8(outptr0, float2int8(_v)); - - 
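// ============================================================================
// NOTE: quantize_fp16sa below keeps the arithmetic in half precision -- the
// scale is cast to __fp16 and multiplied with vmulq_f16 -- whereas the fp16s
// kernels widen to float32 before scaling.  Illustration of the trade-off:
//
//     __fp16 scale = (__fp16)scale_data[0];  // scale itself rounded to fp16
//     __fp16 v = *ptr * scale;               // product at fp16 precision
//
// This is faster on ARMv8.2 FP16 hardware, but __fp16 overflows above 65504,
// so very large scales only survive the fp16s path.
// ============================================================================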
ptr0 += 8; - outptr0 += 8; - } - } - } + _scale0 = vcvt_f16_f32(vld1q_f32((const float*)scale_data)); + _scale1 = _scale0; } + } + float16x8_t _scale = vcombine_f16(_scale0, _scale1); - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _v = vld1q_f16(ptr); + _v = vmulq_f16(_v, _scale); + vst1_s8(s8ptr, float2int8(_v)); + ptr += 8; + s8ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float16x4_t _v = vld1_f16(ptr); + _v = vmul_f16(_v, _scale0); + int8x8_t v = float2int8(vcombine_f16(_v, _v)); + s8ptr[0] = vget_lane_s8(v, 0); + s8ptr[1] = vget_lane_s8(v, 1); + s8ptr[2] = vget_lane_s8(v, 2); + s8ptr[3] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr += 4; + } + for (; i < size; i++) + { + __fp16 v = *ptr * scale; + *s8ptr = float2int8(v); + ptr++; + s8ptr++; + } +} - top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; +static void quantize_pack4to1_fp16sa(const __fp16* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; - if (scale_data_size == 1) - { -#if defined(_MSC_VER) && !defined(__clang__) - float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0])); - float16x8_t _scale = vcombine_f16(_scale0, _scale0); -#else - float16x8_t _scale = vdupq_n_f16((__fp16)scale_data[0]); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q); - - for (int i = 0; i < size; i++) - { - float16x8_t _v = vld1q_f16(ptr0); - _v = vmulq_f16(_v, _scale); - vst1_s8(outptr0, float2int8(_v)); - - ptr0 += 8; - outptr0 += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q); - - float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + q * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + q * 8 + 4))); - - for (int i = 0; i < size; i++) - { - float16x8_t _v = vld1q_f16(ptr0); - _v = vmulq_f16(_v, _scale); - vst1_s8(outptr0, float2int8(_v)); - - ptr0 += 8; - outptr0 += 8; - } - } - } - } + // NCNN_LOGE("quantize_pack4to1_fp16sa %d %d", scale_data_size, elemcount); - return 0; + __fp16 scale = (__fp16)scale_data[0]; + float16x4_t _scale = vdup_n_f16(scale); + if (scale_data_size > 1) + { + _scale = vcvt_f16_f32(vld1q_f32((const float*)scale_data)); } + float16x8_t _scale01 = vcombine_f16(_scale, _scale); - if (elempack == 4) + int i = 0; + for (; i + 7 < elemcount; i += 8) { - if (dims == 1) - { - int w = bottom_blob.w; - int outw = w * elempack; - - top_blob.create(outw, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float16x8_t _v01 = vld1q_f16(ptr); + float16x8_t _v23 = vld1q_f16(ptr + 8); + float16x8_t _v45 = vld1q_f16(ptr + 16); + float16x8_t _v67 = vld1q_f16(ptr + 24); + _v01 = vmulq_f16(_v01, _scale01); + _v23 = vmulq_f16(_v23, _scale01); + _v45 = vmulq_f16(_v45, _scale01); + _v67 = vmulq_f16(_v67, _scale01); + int8x8_t v0 = float2int8(_v01); + int8x8_t v1 = float2int8(_v23); + int8x8_t v2 = float2int8(_v45); + int8x8_t v3 = float2int8(_v67); + int8x16_t v01 = vcombine_s8(v0, v1); + int8x16_t v23 = vcombine_s8(v2, v3); + int8x16x2_t v0213 = 
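// ============================================================================
// NOTE: every rewritten kernel in this patch follows the same width ladder --
// widest vectors first, then narrower ones, then a scalar tail, e.g.:
//
//     int i = 0;
//     for (; i + 7 < size; i += 8) { /* 8-wide vector body */ }
//     for (; i + 3 < size; i += 4) { /* 4-wide vector body */ }
//     for (; i < size; i++)        { /* scalar remainder */ }
//
// so any elemcount * elempack is handled without padding the blob.
// ============================================================================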
vuzpq_s8(v01, v23); + int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]); + vst1_s8(s8ptr0, vget_low_s8(v0123.val[0])); + vst1_s8(s8ptr1, vget_high_s8(v0123.val[0])); + vst1_s8(s8ptr2, vget_low_s8(v0123.val[1])); + vst1_s8(s8ptr3, vget_high_s8(v0123.val[1])); + ptr += 32; + s8ptr0 += 8; + s8ptr1 += 8; + s8ptr2 += 8; + s8ptr3 += 8; + } + for (; i < elemcount; i++) + { + float16x4_t _v = vld1_f16(ptr); + _v = vmul_f16(_v, _scale); + int8x8_t v = float2int8(vcombine_f16(_v, _v)); + s8ptr0[0] = vget_lane_s8(v, 0); + s8ptr1[0] = vget_lane_s8(v, 1); + s8ptr2[0] = vget_lane_s8(v, 2); + s8ptr3[0] = vget_lane_s8(v, 3); + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; + } +} - if (scale_data_size == 1) - { - const __fp16 scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale); - outptr[1] = float2int8(ptr0[1] * scale); - outptr[2] = float2int8(ptr0[2] * scale); - outptr[3] = float2int8(ptr0[3] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * (__fp16)scale_data[i * 4]); - outptr[1] = float2int8(ptr0[1] * (__fp16)scale_data[i * 4 + 1]); - outptr[2] = float2int8(ptr0[2] * (__fp16)scale_data[i * 4 + 2]); - outptr[3] = float2int8(ptr0[3] * (__fp16)scale_data[i * 4 + 3]); - } - } - } +int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; - if (dims == 2) + if (dims == 1) + { + int out_elempack = 1; + if (opt.use_packing_layout) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int outh = h * elempack; + out_elempack = w * elempack % 8 == 0 ? 
8 : 1; + } + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, outh, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (scale_data_size == 1) - { - const __fp16 scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - const __fp16 s0 = scale_data[i * 4]; - const __fp16 s1 = scale_data[i * 4 + 1]; - const __fp16 s2 = scale_data[i * 4 + 2]; - const __fp16 s3 = scale_data[i * 4 + 3]; - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (dims == 3) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = channels * elempack; + const int i = ii * wp; - top_blob.create(w, h, outc, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const __fp16* ptr = (const __fp16*)bottom_blob + i * elempack; + signed char* s8ptr = (signed char*)top_blob + i * elempack; - if (scale_data_size == 1) - { - const __fp16 scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - const __fp16 s0 = scale_data[q 
* 4]; - const __fp16 s1 = scale_data[q * 4 + 1]; - const __fp16 s2 = scale_data[q * 4 + 2]; - const __fp16 s3 = scale_data[q * 4 + 3]; - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } + // assert scale_data_size == 1 + + const int size = std::min(w - i, wp) * elempack; - return 0; + quantize_fp16sa(ptr, s8ptr, scale_data, size, 1); + } } - if (dims == 1) + if (dims == 2) { - int w = bottom_blob.w; + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const __fp16* ptr = bottom_blob; - signed char* outptr = top_blob; - - if (scale_data_size == 1) + if (elempack == 4 && out_elempack == 1) { - const __fp16 scale = scale_data[0]; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int i = 0; i < h; i++) { - outptr[i] = float2int8(ptr[i] * scale); + const __fp16* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_pack4to1_fp16sa(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); } } - else + if (elempack == out_elempack) { #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int i = 0; i < h; i++) { - outptr[i] = float2int8(ptr[i] * (__fp16)scale_data[i]); + const __fp16* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_fp16sa(ptr, s8ptr, scale_data_i, w, elempack); } } } - if (dims == 2) + if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + if (elempack == 4 && out_elempack == 1) { - const __fp16* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); - const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_q = scale_data_size > 1 ? 
scale_data.range(q * elempack, elempack) : scale_data; - for (int j = 0; j < w; j++) - { - *outptr0++ = float2int8(*ptr0++ * scale); + quantize_pack4to1_fp16sa(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); } } - } - - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (elempack == out_elempack) { - const __fp16* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); - const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; - for (int i = 0; i < size; i++) - { - *outptr++ = float2int8(*ptr++ * scale); + quantize_fp16sa(ptr, s8ptr, scale_data_q, w * h, elempack); } } } diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp index a0dd618771d..b4d9d7a9050 100644 --- a/src/layer/loongarch/quantize_loongarch.cpp +++ b/src/layer/loongarch/quantize_loongarch.cpp @@ -29,459 +29,287 @@ Quantize_loongarch::Quantize_loongarch() #endif } -int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("quantize %d %d %d", scale_data_size, elemcount, elempack); + float scale = scale_data[0]; #if __loongarch_sx - if (elempack == 4) + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 4) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
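// ============================================================================
// NOTE: __builtin_prefetch(ptr + 32) in the LSX kernel above hints the cache
// 32 floats (128 bytes, i.e. four 8-element iterations) ahead of the current
// load.  GCC/Clang also accept the optional arguments, e.g.:
//
//     __builtin_prefetch(ptr + 32, /*rw=*/0, /*locality=*/3);  // read, keep hot
//
// The distance is a tuning choice; the kernel is correct without the hint.
// ============================================================================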
8 : 1; - int outw = w * elempack / out_elempack; + _scale = (__m128)__lsx_vld((const float*)scale_data, 0); + } + } +#endif // __loongarch_sx - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + *((int64_t*)s8ptr) = float2int8(_v0, _v1); + ptr += 8; + s8ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = (__m128)__lsx_vld(ptr, 0); + _v = __lsx_vfmul_s(_v, _scale); + v16i8 v = (v16i8)float2int8(_v); + s8ptr[0] = v[0]; + s8ptr[1] = v[1]; + s8ptr[2] = v[2]; + s8ptr[3] = v[3]; + ptr += 4; + s8ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + float v = *ptr * scale; + *s8ptr = float2int8(v); + ptr++; + s8ptr++; + } +} - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale); - outptr[1] = float2int8(ptr0[1] * scale); - outptr[2] = float2int8(ptr0[2] * scale); - outptr[3] = float2int8(ptr0[3] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); - outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); - outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); - outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); - } - } - } +#if __loongarch_sx +static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; - int outh = h * elempack / out_elempack; + // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount); - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_data[0]; + __m128 _scale0 = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = (__m128)__lsx_vld((const float*)scale_data, 0); + _scale1 = (__m128)__lsx_vld((const float*)scale_data + 4, 0); + } - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); - __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); - _vlow = __lsx_vfmul_s(_vlow, _scale); - _vhigh = __lsx_vfmul_s(_vhigh, _scale); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); - __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); - __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); - _vlow = __lsx_vfmul_s(_vlow, _scale0); - _vhigh = __lsx_vfmul_s(_vhigh, _scale1); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - const float s0 = scale_data[i * 4]; - const float s1 = scale_data[i * 4 + 1]; - const float s2 = scale_data[i * 4 + 2]; - const float s3 = scale_data[i * 4 + 3]; - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - 
} - } - } - } + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v0 = (__m128)__lsx_vld(ptr0, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr1, 0); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)s8ptr) = float2int8(_v0, _v1); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; - int outc = channels * elempack / out_elempack; - - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(ptr0 + 32); - __builtin_prefetch(ptr1 + 32); - __m128 _v0 = (__m128)__lsx_vld(ptr0, 0); - __m128 _v1 = (__m128)__lsx_vld(ptr0 + 4, 0); - __m128 _v2 = (__m128)__lsx_vld(ptr1, 0); - __m128 _v3 = (__m128)__lsx_vld(ptr1 + 4, 0); - _v0 = __lsx_vfmul_s(_v0, _scale); - _v1 = __lsx_vfmul_s(_v1, _scale); - _v2 = __lsx_vfmul_s(_v2, _scale); - _v3 = __lsx_vfmul_s(_v3, _scale); - *((int64_t*)outptr) = float2int8(_v0, _v2); - *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3); - - ptr0 += 8; - ptr1 += 8; - outptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); - __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); - _vlow = __lsx_vfmul_s(_vlow, _scale); - _vhigh = __lsx_vfmul_s(_vhigh, _scale); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); - __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); - __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); - _vlow = __lsx_vfmul_s(_vlow, _scale0); - _vhigh = __lsx_vfmul_s(_vhigh, _scale1); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 
+= 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - const float s0 = scale_data[q * 4]; - const float s1 = scale_data[q * 4 + 1]; - const float s2 = scale_data[q * 4 + 2]; - const float s3 = scale_data[q * 4 + 3]; - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } +static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount); - return 0; + float scale = scale_data[0]; + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + if (scale_data_size > 1) + { + _scale = (__m128)__lsx_vld((const float*)scale_data, 0); + } + + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v = (__m128)__lsx_vld(ptr, 0); + _v = __lsx_vfmul_s(_v, _scale); + v16i8 v = (v16i8)float2int8(_v); + s8ptr0[0] = v[0]; + s8ptr1[0] = v[1]; + s8ptr2[0] = v[2]; + s8ptr3[0] = v[3]; + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; } +} #endif // __loongarch_sx +int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + if (dims == 1) { - int w = bottom_blob.w; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const float* ptr = bottom_blob; - signed char* outptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(ptr[i] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(ptr[i] * scale_data[i]); - } + const float* ptr = (const float*)bottom_blob + i * elempack; + signed char* s8ptr = (signed char*)top_blob + i * elempack; + + // assert scale_data_size == 1 + + const int size = std::min(w - i, wp) * elempack; + + quantize(ptr, s8ptr, scale_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 
8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __loongarch_sx + if (elempack == 4 && out_elempack == 8) { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data; - for (int j = 0; j < w; j++) + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - *outptr0++ = float2int8(*ptr0++ * scale); + const float* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); + } + } +#endif // __loongarch_sx + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize(ptr, s8ptr, scale_data_i, w, elempack); } } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __loongarch_sx + if (elempack == 4 && out_elempack == 8) { - const float* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? 
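// ============================================================================
// NOTE: index map for the pack4 -> pack8 channel repack dispatched here --
// out_elempack == 8 guarantees channels is even, so outc = channels / 2 and
// output channel q gathers input pack4 channels 2q and 2q + 1:
//
//     // for each element e, lanes k = 0..3:
//     // top.channel(q)[8*e + k]     = bottom.channel(2*q)[4*e + k]
//     // top.channel(q)[8*e + 4 + k] = bottom.channel(2*q + 1)[4*e + k]
//
// with the per-channel scales taken from scale_data.range(8 * q, 8).
// ============================================================================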
scale_data.range(q * out_elempack, out_elempack) : scale_data; - int i = 0; -#if __loongarch_sx - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); - for (; i + 15 < size; i += 16) - { - __builtin_prefetch(ptr + 64); - __m128 _v0 = (__m128)__lsx_vld(ptr, 0); - __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); - __m128 _v2 = (__m128)__lsx_vld(ptr + 8, 0); - __m128 _v3 = (__m128)__lsx_vld(ptr + 12, 0); - _v0 = __lsx_vfmul_s(_v0, _scale); - _v1 = __lsx_vfmul_s(_v1, _scale); - _v2 = __lsx_vfmul_s(_v2, _scale); - _v3 = __lsx_vfmul_s(_v3, _scale); - *((int64_t*)outptr) = float2int8(_v0, _v1); - *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3); - - ptr += 16; - outptr += 16; + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h); } - for (; i + 7 < size; i += 8) + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - __builtin_prefetch(ptr + 32); - __m128 _v0 = (__m128)__lsx_vld(ptr, 0); - __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); - _v0 = __lsx_vfmul_s(_v0, _scale); - _v1 = __lsx_vfmul_s(_v1, _scale); - *((int64_t*)outptr) = float2int8(_v0, _v1); - - ptr += 8; - outptr += 8; + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); } + } #endif // __loongarch_sx - for (; i < size; i++) + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - *outptr++ = float2int8(*ptr++ * scale); + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize(ptr, s8ptr, scale_data_q, w * h, elempack); } } } diff --git a/src/layer/mips/quantize_mips.cpp b/src/layer/mips/quantize_mips.cpp index 963d0908ce4..0b9ec9db62c 100644 --- a/src/layer/mips/quantize_mips.cpp +++ b/src/layer/mips/quantize_mips.cpp @@ -29,459 +29,287 @@ Quantize_mips::Quantize_mips() #endif } -int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("quantize %d %d %d", scale_data_size, elemcount, elempack); + float scale = scale_data[0]; #if __mips_msa - if (elempack == 4) + v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 4) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; - int outw = w * elempack / out_elempack; + _scale = (v4f32)__msa_ld_w((const float*)scale_data, 0); + } + } +#endif // __mips_msa - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + int i = 0; +#if __mips_msa + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 32); + v4f32 _v0 = (v4f32)__msa_ld_w(ptr, 0); + v4f32 _v1 = (v4f32)__msa_ld_w(ptr + 4, 0); + _v0 = __msa_fmul_w(_v0, _scale); + _v1 = __msa_fmul_w(_v1, _scale); + *((int64_t*)s8ptr) = float2int8(_v0, _v1); + ptr += 8; + s8ptr += 8; + } + for (; i + 3 < size; i += 4) + { + v4f32 _v = (v4f32)__msa_ld_w(ptr, 0); + _v = __msa_fmul_w(_v, _scale); + v16i8 v = float2int8(_v); + s8ptr[0] = v[0]; + s8ptr[1] = v[1]; + s8ptr[2] = v[2]; + s8ptr[3] = v[3]; + ptr += 4; + s8ptr += 4; + } +#endif // __mips_msa + for (; i < size; i++) + { + float v = *ptr * scale; + *s8ptr = float2int8(v); + ptr++; + s8ptr++; + } +} - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale); - outptr[1] = float2int8(ptr0[1] * scale); - outptr[2] = float2int8(ptr0[2] * scale); - outptr[3] = float2int8(ptr0[3] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); - outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); - outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); - outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); - } - } - } +#if __mips_msa +static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; - int outh = h * elempack / out_elempack; + // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount); - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_data[0]; + v4f32 _scale0 = (v4f32)__msa_fill_w_f32(scale); + v4f32 _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = (v4f32)__msa_ld_w((const float*)scale_data, 0); + _scale1 = (v4f32)__msa_ld_w((const float*)scale_data + 4, 0); + } - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0); - v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0); - _vlow = __msa_fmul_w(_vlow, _scale); - _vhigh = __msa_fmul_w(_vhigh, _scale); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - v4f32 _scale0 = (v4f32)__msa_ld_w((const float*)scale_data + i * 8, 0); - v4f32 _scale1 = (v4f32)__msa_ld_w((const float*)scale_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0); - v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0); - _vlow = __msa_fmul_w(_vlow, _scale0); - _vhigh = __msa_fmul_w(_vhigh, _scale1); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - const float s0 = scale_data[i * 4]; - const float s1 = scale_data[i * 4 + 1]; - const float s2 = scale_data[i * 4 + 2]; - const float s3 = scale_data[i * 4 + 3]; - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } + 
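// ============================================================================
// NOTE: the MSA kernels store 8 quantized bytes with a single type-punned
// write, *((int64_t*)s8ptr) = float2int8(_v0, _v1).  Where alignment or
// strict aliasing is a concern, an equivalent portable form is:
//
//     int64_t v = float2int8(_v0, _v1);
//     memcpy(s8ptr, &v, 8);   // <cstring>; compilers emit one 8-byte store
//
// ncnn's allocator aligns blob data, so in practice the cast form is safe
// here.
// ============================================================================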
int i = 0; + for (; i < elemcount; i++) + { + v4f32 _v0 = (v4f32)__msa_ld_w(ptr0, 0); + v4f32 _v1 = (v4f32)__msa_ld_w(ptr1, 0); + _v0 = __msa_fmul_w(_v0, _scale0); + _v1 = __msa_fmul_w(_v1, _scale1); + *((int64_t*)s8ptr) = float2int8(_v0, _v1); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; - int outc = channels * elempack / out_elempack; - - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(ptr0 + 32); - __builtin_prefetch(ptr1 + 32); - v4f32 _v0 = (v4f32)__msa_ld_w(ptr0, 0); - v4f32 _v1 = (v4f32)__msa_ld_w(ptr0 + 4, 0); - v4f32 _v2 = (v4f32)__msa_ld_w(ptr1, 0); - v4f32 _v3 = (v4f32)__msa_ld_w(ptr1 + 4, 0); - _v0 = __msa_fmul_w(_v0, _scale); - _v1 = __msa_fmul_w(_v1, _scale); - _v2 = __msa_fmul_w(_v2, _scale); - _v3 = __msa_fmul_w(_v3, _scale); - *((int64_t*)outptr) = float2int8(_v0, _v2); - *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3); - - ptr0 += 8; - ptr1 += 8; - outptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0); - v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0); - _vlow = __msa_fmul_w(_vlow, _scale); - _vhigh = __msa_fmul_w(_vhigh, _scale); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - v4f32 _scale0 = (v4f32)__msa_ld_w((const float*)scale_data + q * 8, 0); - v4f32 _scale1 = (v4f32)__msa_ld_w((const float*)scale_data + q * 8 + 4, 0); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(ptr0 + 16); - __builtin_prefetch(ptr1 + 16); - v4f32 _vlow = (v4f32)__msa_ld_w(ptr0, 0); - v4f32 _vhigh = (v4f32)__msa_ld_w(ptr1, 0); - _vlow = __msa_fmul_w(_vlow, _scale0); - _vhigh = __msa_fmul_w(_vhigh, _scale1); - *((int64_t*)outptr) = float2int8(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - 
{ - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - const float s0 = scale_data[q * 4]; - const float s1 = scale_data[q * 4 + 1]; - const float s2 = scale_data[q * 4 + 2]; - const float s3 = scale_data[q * 4 + 3]; - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } +static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount); - return 0; + float scale = scale_data[0]; + v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); + if (scale_data_size > 1) + { + _scale = (v4f32)__msa_ld_w((const float*)scale_data, 0); + } + + int i = 0; + for (; i < elemcount; i++) + { + v4f32 _v = (v4f32)__msa_ld_w(ptr, 0); + _v = __msa_fmul_w(_v, _scale); + v16i8 v = float2int8(_v); + s8ptr0[0] = v[0]; + s8ptr1[0] = v[1]; + s8ptr2[0] = v[2]; + s8ptr3[0] = v[3]; + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; } +} #endif // __mips_msa +int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + if (dims == 1) { - int w = bottom_blob.w; + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const float* ptr = bottom_blob; - signed char* outptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(ptr[i] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(ptr[i] * scale_data[i]); - } + const float* ptr = (const float*)bottom_blob + i * elempack; + signed char* s8ptr = (signed char*)top_blob + i * elempack; + + // assert scale_data_size == 1 + + const int size = std::min(w - i, wp) * elempack; + + quantize(ptr, s8ptr, scale_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 
8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __mips_msa + if (elempack == 4 && out_elempack == 8) { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data; - for (int j = 0; j < w; j++) + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - *outptr0++ = float2int8(*ptr0++ * scale); + const float* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); + } + } +#endif // __mips_msa + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize(ptr, s8ptr, scale_data_i, w, elempack); } } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __mips_msa + if (elempack == 4 && out_elempack == 8) { - const float* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? 
scale_data.range(q * out_elempack, out_elempack) : scale_data; - int i = 0; -#if __mips_msa - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); - for (; i + 15 < size; i += 16) - { - __builtin_prefetch(ptr + 64); - v4f32 _v0 = (v4f32)__msa_ld_w(ptr, 0); - v4f32 _v1 = (v4f32)__msa_ld_w(ptr + 4, 0); - v4f32 _v2 = (v4f32)__msa_ld_w(ptr + 8, 0); - v4f32 _v3 = (v4f32)__msa_ld_w(ptr + 12, 0); - _v0 = __msa_fmul_w(_v0, _scale); - _v1 = __msa_fmul_w(_v1, _scale); - _v2 = __msa_fmul_w(_v2, _scale); - _v3 = __msa_fmul_w(_v3, _scale); - *((int64_t*)outptr) = float2int8(_v0, _v1); - *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3); - - ptr += 16; - outptr += 16; + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h); } - for (; i + 7 < size; i += 8) + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - __builtin_prefetch(ptr + 32); - v4f32 _v0 = (v4f32)__msa_ld_w(ptr, 0); - v4f32 _v1 = (v4f32)__msa_ld_w(ptr + 4, 0); - _v0 = __msa_fmul_w(_v0, _scale); - _v1 = __msa_fmul_w(_v1, _scale); - *((int64_t*)outptr) = float2int8(_v0, _v1); - - ptr += 8; - outptr += 8; + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); } + } #endif // __mips_msa - for (; i < size; i++) + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - *outptr++ = float2int8(*ptr++ * scale); + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? 
scale_data.range(q * elempack, elempack) : scale_data; + + quantize(ptr, s8ptr, scale_data_q, w * h, elempack); } } } diff --git a/src/layer/quantize.cpp b/src/layer/quantize.cpp index a53cebdd9a0..c2770dcfb24 100644 --- a/src/layer/quantize.cpp +++ b/src/layer/quantize.cpp @@ -46,46 +46,41 @@ static inline signed char float2int8(float v) return (signed char)int32; } +static void quantize(const float* ptr, signed char* s8ptr, float scale, int size) +{ + for (int i = 0; i < size; i++) + { + *s8ptr = float2int8(*ptr * scale); + ptr++; + s8ptr++; + } +} + int Quantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int dims = bottom_blob.dims; + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; if (dims == 1) { - int w = bottom_blob.w; - top_blob.create(w, (size_t)1u, opt.blob_allocator); if (top_blob.empty()) return -100; + // assert scale_data_size == 1 + const float* ptr = bottom_blob; - signed char* outptr = top_blob; + signed char* s8ptr = top_blob; - if (scale_data_size == 1) - { - const float scale = scale_data[0]; + const float scale = scale_data[0]; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(ptr[i] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - outptr[i] = float2int8(ptr[i] * scale_data[i]); - } - } + quantize(ptr, s8ptr, scale, w); } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -93,25 +88,17 @@ int Quantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + const float* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - for (int j = 0; j < w; j++) - { - outptr0[j] = float2int8(ptr0[j] * scale); - } + quantize(ptr, s8ptr, scale, w); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -120,14 +107,11 @@ int Quantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; - for (int i = 0; i < size; i++) - { - outptr[i] = float2int8(ptr[i] * scale); - } + quantize(ptr, s8ptr, scale, w * h); } } diff --git a/src/layer/x86/quantize_x86.cpp b/src/layer/x86/quantize_x86.cpp index 8f7ee993673..6295ebead94 100644 --- a/src/layer/x86/quantize_x86.cpp +++ b/src/layer/x86/quantize_x86.cpp @@ -32,687 +32,455 @@ Quantize_x86::Quantize_x86() #endif // __SSE2__ } -int Quantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("quantize %d %d %d", scale_data_size, elemcount, elempack); + float scale = scale_data[0]; #if __SSE2__ + __m128 _scale = _mm_set1_ps(scale); #if __AVX__ + __m256 _scale_avx = _mm256_set1_ps(scale); #if __AVX512F__ - if (elempack == 16) + __m512 _scale_avx512 = _mm512_set1_ps(scale); +#endif // __AVX512F__ +#endif // __AVX__ + if (scale_data_size > 1) { - Mat tmp; - convert_packing(bottom_blob, tmp, 8, opt); - - forward(tmp, top_blob, opt); - - return 0; +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + _scale_avx512 = _mm512_loadu_ps((const float*)scale_data); + } +#endif // __AVX512F__ + if (elempack == 8) + { + _scale_avx = _mm256_loadu_ps((const float*)scale_data); +#if __AVX512F__ + _scale_avx512 = combine8x2_ps(_scale_avx, _scale_avx); +#endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { + _scale = _mm_loadu_ps((const float*)scale_data); +#if __AVX__ + _scale_avx = combine4x2_ps(_scale, _scale); +#if __AVX512F__ + _scale_avx512 = combine8x2_ps(_scale_avx, _scale_avx); +#endif // __AVX512F__ +#endif // __AVX__ + } } +#endif // __SSE2__ + + int i = 0; +#if __SSE2__ +#if __AVX__ + for (; i + 15 < size; i += 16) + { +#if __AVX512F__ + __m512 _v = _mm512_loadu_ps(ptr); + _v = _mm512_mul_ps(_v, _scale_avx512); + _mm_storeu_si128((__m128i*)s8ptr, float2int8_avx512(_v)); +#else // __AVX512F__ + __m256 _v0 = _mm256_loadu_ps(ptr); + __m256 _v1 = _mm256_loadu_ps(ptr + 8); + _v0 = _mm256_mul_ps(_v0, _scale_avx); + _v1 = _mm256_mul_ps(_v1, _scale_avx); + _mm_storeu_si128((__m128i*)s8ptr, float2int8_avx(_v0, _v1)); #endif // __AVX512F__ + ptr += 16; + s8ptr += 16; + } +#endif // __AVX__ + for (; i + 7 < size; i += 8) + { +#if __AVX__ + __m256 _v = _mm256_loadu_ps(ptr); + _v = _mm256_mul_ps(_v, _scale_avx); + *(int64_t*)s8ptr = float2int8_avx(_v); +#else // __AVX__ + __m128 _v0 = _mm_loadu_ps(ptr); + __m128 _v1 = _mm_loadu_ps(ptr + 4); + _v0 = _mm_mul_ps(_v0, _scale); + _v1 = _mm_mul_ps(_v1, _scale); + *(int64_t*)s8ptr = float2int8_sse(_v0, _v1); +#endif // __AVX__ + ptr += 8; + s8ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = _mm_loadu_ps(ptr); + _v = _mm_mul_ps(_v, _scale); + int32_t v = float2int8_sse(_v); + s8ptr[0] = (v >> 0) & 0xff; + s8ptr[1] = (v >> 8) & 0xff; + s8ptr[2] = (v >> 16) & 0xff; + s8ptr[3] = (v >> 24) & 0xff; + ptr += 4; + s8ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + float v = *ptr * scale; + *s8ptr = float2int8(v); + ptr++; + s8ptr++; + } +} - if (elempack == 8) +#if __SSE2__ +#if __AVX512F__ +static void quantize_pack16to8(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack16to8 %d %d", scale_data_size, 
elemcount); + + float scale = scale_data[0]; + __m512 _scale = _mm512_set1_ps(scale); + if (scale_data_size > 1) { - if (dims == 1) - { - int w = bottom_blob.w; + _scale = _mm512_loadu_ps((const float*)scale_data); + } - top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; + int i = 0; + for (; i < elemcount; i++) + { + __m512 _v = _mm512_loadu_ps(ptr); + _v = _mm512_mul_ps(_v, _scale); + __m128i v = float2int8_avx512(_v); + _mm_storel_pd((double*)s8ptr0, _mm_castsi128_pd(v)); + _mm_storeh_pd((double*)s8ptr1, _mm_castsi128_pd(v)); + ptr += 16; + s8ptr0 += 8; + s8ptr1 += 8; + } +} +#endif // __AVX512F__ - if (scale_data_size == 1) - { - __m256 _scale = _mm256_set1_ps(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr = (const float*)bottom_blob + i * 8; - signed char* outptr = (signed char*)top_blob + i * 8; - - __m256 _v = _mm256_loadu_ps(ptr); - _v = _mm256_mul_ps(_v, _scale); - *(int64_t*)outptr = float2int8_avx(_v); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr = (const float*)bottom_blob + i * 8; - signed char* outptr = (signed char*)top_blob + i * 8; - - __m256 _v = _mm256_loadu_ps(ptr); - __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8); - _v = _mm256_mul_ps(_v, _scale); - *(int64_t*)outptr = float2int8_avx(_v); - } - } - } +#if !__AVX__ +static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; + // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount); - top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_data[0]; + __m128 _scale0 = _mm_set1_ps(scale); + __m128 _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = _mm_loadu_ps((const float*)scale_data); + _scale1 = _mm_loadu_ps((const float*)scale_data + 4); + } - if (scale_data_size == 1) - { - __m256 _scale = _mm256_set1_ps(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr = bottom_blob.row(i); - signed char* outptr = top_blob.row(i); - - int j = 0; - for (; j + 1 < w; j += 2) - { - __m256 _v0 = _mm256_loadu_ps(ptr); - __m256 _v1 = _mm256_loadu_ps(ptr + 8); - _v0 = _mm256_mul_ps(_v0, _scale); - _v1 = _mm256_mul_ps(_v1, _scale); - __m128i _v = float2int8_avx(_v0, _v1); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr += 16; - outptr += 16; - } - for (; j < w; j++) - { - __m256 _v = _mm256_loadu_ps(ptr); - _v = _mm256_mul_ps(_v, _scale); - *(int64_t*)outptr = float2int8_avx(_v); - - ptr += 8; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr = bottom_blob.row(i); - signed char* outptr = top_blob.row(i); - - __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8); - - int j = 0; - for (; j + 1 < w; j += 2) - { - __m256 _v0 = _mm256_loadu_ps(ptr); - __m256 _v1 = _mm256_loadu_ps(ptr + 8); - _v0 = _mm256_mul_ps(_v0, _scale); - _v1 = _mm256_mul_ps(_v1, _scale); - __m128i _v = float2int8_avx(_v0, _v1); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr += 16; - outptr += 16; - } - for (; j < w; j++) - { - __m256 _v = _mm256_loadu_ps(ptr); - _v = _mm256_mul_ps(_v, _scale); - 
*(int64_t*)outptr = float2int8_avx(_v); - - ptr += 8; - outptr += 8; - } - } - } - } + int i = 0; + for (; i + 1 < elemcount; i += 2) + { + __m128 _v0 = _mm_loadu_ps(ptr0); + __m128 _v1 = _mm_loadu_ps(ptr1); + __m128 _v2 = _mm_loadu_ps(ptr0 + 4); + __m128 _v3 = _mm_loadu_ps(ptr1 + 4); + _v0 = _mm_mul_ps(_v0, _scale0); + _v1 = _mm_mul_ps(_v1, _scale1); + _v2 = _mm_mul_ps(_v2, _scale0); + _v3 = _mm_mul_ps(_v3, _scale1); + _mm_storeu_si128((__m128i*)s8ptr, float2int8_sse(_v0, _v1, _v2, _v3)); + ptr0 += 8; + ptr1 += 8; + s8ptr += 16; + } + for (; i < elemcount; i++) + { + __m128 _v0 = _mm_loadu_ps(ptr0); + __m128 _v1 = _mm_loadu_ps(ptr1); + _v0 = _mm_mul_ps(_v0, _scale0); + _v1 = _mm_mul_ps(_v1, _scale1); + *(int64_t*)s8ptr = float2int8_sse(_v0, _v1); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} +#endif // !__AVX__ - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; +static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; - top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; + // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount); - if (scale_data_size == 1) - { - __m256 _scale = _mm256_set1_ps(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __m256 _v0 = _mm256_loadu_ps(ptr); - __m256 _v1 = _mm256_loadu_ps(ptr + 8); - _v0 = _mm256_mul_ps(_v0, _scale); - _v1 = _mm256_mul_ps(_v1, _scale); - __m128i _v = float2int8_avx(_v0, _v1); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr += 16; - outptr += 16; - } - for (; i < size; i++) - { - __m256 _v = _mm256_loadu_ps(ptr); - _v = _mm256_mul_ps(_v, _scale); - *(int64_t*)outptr = float2int8_avx(_v); - - ptr += 8; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); - - __m256 _scale = _mm256_loadu_ps((const float*)scale_data + q * 8); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __m256 _v0 = _mm256_loadu_ps(ptr); - __m256 _v1 = _mm256_loadu_ps(ptr + 8); - _v0 = _mm256_mul_ps(_v0, _scale); - _v1 = _mm256_mul_ps(_v1, _scale); - __m128i _v = float2int8_avx(_v0, _v1); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr += 16; - outptr += 16; - } - for (; i < size; i++) - { - __m256 _v = _mm256_loadu_ps(ptr); - _v = _mm256_mul_ps(_v, _scale); - *(int64_t*)outptr = float2int8_avx(_v); - - ptr += 8; - outptr += 8; - } - } - } - } + float scale = scale_data[0]; + __m128 _scale = _mm_set1_ps(scale); + if (scale_data_size > 1) + { + _scale = _mm_loadu_ps((const float*)scale_data); + } - return 0; + int i = 0; + for (; i + 7 < elemcount; i += 8) + { + __m128 _v0 = _mm_loadu_ps(ptr); + __m128 _v1 = _mm_loadu_ps(ptr + 4); + __m128 _v2 = _mm_loadu_ps(ptr + 8); + __m128 _v3 = _mm_loadu_ps(ptr + 12); + __m128 _v4 = _mm_loadu_ps(ptr + 16); + __m128 _v5 = _mm_loadu_ps(ptr + 20); + __m128 _v6 = _mm_loadu_ps(ptr + 24); + __m128 _v7 = _mm_loadu_ps(ptr + 28); + _v0 = _mm_mul_ps(_v0, _scale); + _v1 = _mm_mul_ps(_v1, _scale); + _v2 = _mm_mul_ps(_v2, _scale); + _v3 = _mm_mul_ps(_v3, _scale); + _v4 = 
_mm_mul_ps(_v4, _scale); + _v5 = _mm_mul_ps(_v5, _scale); + _v6 = _mm_mul_ps(_v6, _scale); + _v7 = _mm_mul_ps(_v7, _scale); + __m128i v0426 = float2int8_sse(_v0, _v4, _v2, _v6); + __m128i v1537 = float2int8_sse(_v1, _v5, _v3, _v7); + __m128i v0145 = _mm_unpacklo_epi8(v0426, v1537); + __m128i v2367 = _mm_unpackhi_epi8(v0426, v1537); + __m128i v0123 = _mm_unpacklo_epi16(v0145, v2367); + __m128i v4567 = _mm_unpackhi_epi16(v0145, v2367); + __m128i v01 = _mm_unpacklo_epi32(v0123, v4567); + __m128i v23 = _mm_unpackhi_epi32(v0123, v4567); + _mm_storel_pd((double*)s8ptr0, _mm_castsi128_pd(v01)); + _mm_storeh_pd((double*)s8ptr1, _mm_castsi128_pd(v01)); + _mm_storel_pd((double*)s8ptr2, _mm_castsi128_pd(v23)); + _mm_storeh_pd((double*)s8ptr3, _mm_castsi128_pd(v23)); + ptr += 32; + s8ptr0 += 8; + s8ptr1 += 8; + s8ptr2 += 8; + s8ptr3 += 8; } -#endif // __AVX__ + for (; i < elemcount; i++) + { + __m128 _v = _mm_loadu_ps(ptr); + _v = _mm_mul_ps(_v, _scale); + int64_t v = float2int8_sse(_v, _v); + s8ptr0[0] = (v >> 32) & 0xff; + s8ptr1[0] = (v >> 40) & 0xff; + s8ptr2[0] = (v >> 48) & 0xff; + s8ptr3[0] = (v >> 56) & 0xff; + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; + } +} +#endif // __SSE2__ - if (elempack == 4) +int Quantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + if (dims == 1) { - if (dims == 1) + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; - int outw = w * elempack / out_elempack; + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale); - outptr[1] = float2int8(ptr0[1] * scale); - outptr[2] = float2int8(ptr0[2] * scale); - outptr[3] = float2int8(ptr0[3] * scale); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const float* ptr0 = (const float*)bottom_blob + i * 4; - signed char* outptr = (signed char*)top_blob + i * 4; - - outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); - outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); - outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); - outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); - } - } - } + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (dims == 2) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; - int outh = h * elempack / out_elempack; + const int i = ii * wp; - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const float* ptr = (const float*)bottom_blob + i * elempack; + signed char* s8ptr = (signed char*)top_blob + i * elempack; - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - __m128 _scale = _mm_set1_ps(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - int j = 0; - for (; j + 1 < w; j += 2) - { - __m128 _v0 = _mm_loadu_ps(ptr0); - __m128 _v1 = _mm_loadu_ps(ptr1); - __m128 _v2 = _mm_loadu_ps(ptr0 + 4); - __m128 _v3 = _mm_loadu_ps(ptr1 + 4); - _v0 = _mm_mul_ps(_v0, _scale); - _v1 = _mm_mul_ps(_v1, _scale); - _v2 = _mm_mul_ps(_v2, _scale); - _v3 = _mm_mul_ps(_v3, _scale); - __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr0 += 8; - ptr1 += 8; - outptr += 16; - } - for (; j < w; j++) - { - __m128 _vlow = _mm_loadu_ps(ptr0); - __m128 _vhigh = _mm_loadu_ps(ptr1); - _vlow = _mm_mul_ps(_vlow, _scale); - _vhigh = _mm_mul_ps(_vhigh, _scale); - *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const float* ptr0 = bottom_blob.row(i * 2); - const float* ptr1 = bottom_blob.row(i * 2 + 1); - signed char* outptr = top_blob.row(i); - - __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + i * 8); - __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + i * 8 + 4); - - int j = 0; - for (; j + 1 < w; j += 2) - { - __m128 _v0 = _mm_loadu_ps(ptr0); - __m128 _v1 = _mm_loadu_ps(ptr1); - __m128 _v2 = _mm_loadu_ps(ptr0 + 4); - __m128 _v3 = _mm_loadu_ps(ptr1 + 4); - _v0 = _mm_mul_ps(_v0, _scale0); - _v1 = _mm_mul_ps(_v1, _scale1); - _v2 = _mm_mul_ps(_v2, _scale0); - _v3 = _mm_mul_ps(_v3, _scale1); - __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr0 += 8; - ptr1 += 8; - outptr += 16; - } - for (; j < w; j++) - { - __m128 _vlow = _mm_loadu_ps(ptr0); - __m128 _vhigh = _mm_loadu_ps(ptr1); - _vlow = _mm_mul_ps(_vlow, _scale0); - _vhigh = _mm_mul_ps(_vhigh, _scale1); - *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i * 4); - signed char* outptr1 = top_blob.row(i * 4 + 1); - signed char* outptr2 = 
top_blob.row(i * 4 + 2); - signed char* outptr3 = top_blob.row(i * 4 + 3); - - const float s0 = scale_data[i * 4]; - const float s1 = scale_data[i * 4 + 1]; - const float s2 = scale_data[i * 4 + 2]; - const float s3 = scale_data[i * 4 + 3]; - - for (int j = 0; j < w; j++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } + // assert scale_data_size == 1 - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; - int outc = channels * elempack / out_elempack; - - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (scale_data_size == 1) - { - __m128 _scale = _mm_set1_ps(scale_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __m128 _v0 = _mm_loadu_ps(ptr0); - __m128 _v1 = _mm_loadu_ps(ptr1); - __m128 _v2 = _mm_loadu_ps(ptr0 + 4); - __m128 _v3 = _mm_loadu_ps(ptr1 + 4); - _v0 = _mm_mul_ps(_v0, _scale); - _v1 = _mm_mul_ps(_v1, _scale); - _v2 = _mm_mul_ps(_v2, _scale); - _v3 = _mm_mul_ps(_v3, _scale); - __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr0 += 8; - ptr1 += 8; - outptr += 16; - } - for (; i < size; i++) - { - __m128 _vlow = _mm_loadu_ps(ptr0); - __m128 _vhigh = _mm_loadu_ps(ptr1); - _vlow = _mm_mul_ps(_vlow, _scale); - _vhigh = _mm_mul_ps(_vhigh, _scale); - *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const float* ptr0 = bottom_blob.channel(q * 2); - const float* ptr1 = bottom_blob.channel(q * 2 + 1); - signed char* outptr = top_blob.channel(q); - - __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + q * 8); - __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + q * 8 + 4); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __m128 _v0 = _mm_loadu_ps(ptr0); - __m128 _v1 = _mm_loadu_ps(ptr1); - __m128 _v2 = _mm_loadu_ps(ptr0 + 4); - __m128 _v3 = _mm_loadu_ps(ptr1 + 4); - _v0 = _mm_mul_ps(_v0, _scale0); - _v1 = _mm_mul_ps(_v1, _scale1); - _v2 = _mm_mul_ps(_v2, _scale0); - _v3 = _mm_mul_ps(_v3, _scale1); - __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3); - _mm_storeu_si128((__m128i*)outptr, _v); - - ptr0 += 8; - ptr1 += 8; - outptr += 16; - } - for (; i < size; i++) - { - __m128 _vlow = _mm_loadu_ps(ptr0); - __m128 _vhigh = _mm_loadu_ps(ptr1); - _vlow = _mm_mul_ps(_vlow, _scale0); - _vhigh = _mm_mul_ps(_vhigh, _scale1); - *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh); - - ptr0 += 4; - ptr1 += 4; - outptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = 
top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * scale); - outptr1[0] = float2int8(ptr0[1] * scale); - outptr2[0] = float2int8(ptr0[2] * scale); - outptr3[0] = float2int8(ptr0[3] * scale); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = bottom_blob.channel(q); - signed char* outptr0 = top_blob.channel(q * 4); - signed char* outptr1 = top_blob.channel(q * 4 + 1); - signed char* outptr2 = top_blob.channel(q * 4 + 2); - signed char* outptr3 = top_blob.channel(q * 4 + 3); - - const float s0 = scale_data[q * 4]; - const float s1 = scale_data[q * 4 + 1]; - const float s2 = scale_data[q * 4 + 2]; - const float s3 = scale_data[q * 4 + 3]; - - for (int i = 0; i < size; i++) - { - outptr0[0] = float2int8(ptr0[0] * s0); - outptr1[0] = float2int8(ptr0[1] * s1); - outptr2[0] = float2int8(ptr0[2] * s2); - outptr3[0] = float2int8(ptr0[3] * s3); - - ptr0 += 4; - outptr0 += 1; - outptr1 += 1; - outptr2 += 1; - outptr3 += 1; - } - } - } - } - } + const int size = std::min(w - i, wp) * elempack; - return 0; + quantize(ptr, s8ptr, scale_data, size, 1); + } } -#endif // __SSE2__ - if (dims == 1) + if (dims == 2) { - int w = bottom_blob.w; + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const float* ptr = bottom_blob; - signed char* outptr = top_blob; +#if __SSE2__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 2); + signed char* s8ptr1 = top_blob.row(i * 2 + 1); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; - if (scale_data_size == 1) + quantize_pack16to8(ptr, s8ptr0, s8ptr1, scale_data_i, w); + } + } +#endif // __AVX512F__ +#if !__AVX__ + if (elempack == 4 && out_elempack == 8) { - const float scale = scale_data[0]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w); + } + } +#endif // !__AVX__ + if (elempack == 4 && out_elempack == 1) + { #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int i = 0; i < h; i++) { - outptr[i] = float2int8(ptr[i] * scale); + const float* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_data_i = scale_data_size > 1 ? 
scale_data.range(i * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); } } - else +#endif // __SSE2__ + if (elempack == out_elempack) { #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) + for (int i = 0; i < h; i++) { - outptr[i] = float2int8(ptr[i] * scale_data[i]); + const float* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize(ptr, s8ptr, scale_data_i, w, elempack); } } } - if (dims == 2) + if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __SSE2__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 8) { - const float* ptr0 = bottom_blob.row(i); - signed char* outptr0 = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 2); + signed char* s8ptr1 = top_blob.channel(q * 2 + 1); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; - for (int j = 0; j < w; j++) - { - *outptr0++ = float2int8(*ptr0++ * scale); + quantize_pack16to8(ptr, s8ptr0, s8ptr1, scale_data_q, w * h); } } - } - - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; +#endif // __AVX512F__ +#if !__AVX__ + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h); + } + } +#endif // !__AVX__ + if (elempack == 4 && out_elempack == 1) { - const float* ptr = bottom_blob.channel(q); - signed char* outptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? 
scale_data.range(q * elempack, elempack) : scale_data; - for (int i = 0; i < size; i++) + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); + } + } +#endif // __SSE2__ + if (elempack == out_elempack) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - *outptr++ = float2int8(*ptr++ * scale); + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize(ptr, s8ptr, scale_data_q, w * h, elempack); } } } diff --git a/src/layer/x86/requantize_x86.cpp b/src/layer/x86/requantize_x86.cpp index 996681e5e42..6b64f86967d 100644 --- a/src/layer/x86/requantize_x86.cpp +++ b/src/layer/x86/requantize_x86.cpp @@ -44,16 +44,17 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ float scale_in = scale_in_data[0]; #if __SSE2__ - __m128 _scale_in = _mm_set1_ps(scale_in); + __m128 _scale_in0 = _mm_set1_ps(scale_in); #if __AVX__ __m256 _scale_in_avx = _mm256_set1_ps(scale_in); #if __AVX512F__ __m512 _scale_in_avx512 = _mm512_set1_ps(scale_in); #endif // __AVX512F__ +#else // __AVX__ + __m128 _scale_in1 = _scale_in0; #endif // __AVX__ if (scale_in_data_size > 1) { -#if __AVX__ #if __AVX512F__ if (elempack == 16) { @@ -62,20 +63,26 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ #endif // __AVX512F__ if (elempack == 8) { +#if __AVX__ _scale_in_avx = _mm256_loadu_ps((const float*)scale_in_data); #if __AVX512F__ _scale_in_avx512 = combine8x2_ps(_scale_in_avx, _scale_in_avx); #endif // __AVX512F__ - } +#else // __AVX__ + _scale_in0 = _mm_loadu_ps((const float*)scale_in_data); + _scale_in1 = _mm_loadu_ps((const float*)scale_in_data + 4); #endif // __AVX__ + } if (elempack == 4) { - _scale_in = _mm_loadu_ps((const float*)scale_in_data); + _scale_in0 = _mm_loadu_ps((const float*)scale_in_data); #if __AVX__ - _scale_in_avx = combine4x2_ps(_scale_in, _scale_in); + _scale_in_avx = combine4x2_ps(_scale_in0, _scale_in0); #if __AVX512F__ _scale_in_avx512 = combine8x2_ps(_scale_in_avx, _scale_in_avx); #endif // __AVX512F__ +#else // __AVX__ + _scale_in1 = _scale_in0; #endif // __AVX__ } } @@ -83,16 +90,17 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ float scale_out = scale_out_data[0]; #if __SSE2__ - __m128 _scale_out = _mm_set1_ps(scale_out); + __m128 _scale_out0 = _mm_set1_ps(scale_out); #if __AVX__ __m256 _scale_out_avx = _mm256_set1_ps(scale_out); #if __AVX512F__ __m512 _scale_out_avx512 = _mm512_set1_ps(scale_out); #endif // __AVX512F__ +#else // __AVX__ + __m128 _scale_out1 = _scale_out0; #endif // __AVX__ if (scale_out_data_size > 1) { -#if __AVX__ #if __AVX512F__ if (elempack == 16) { @@ -101,20 +109,26 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ #endif // __AVX512F__ if (elempack == 8) { +#if __AVX__ _scale_out_avx = _mm256_loadu_ps((const float*)scale_out_data); #if __AVX512F__ _scale_out_avx512 = combine8x2_ps(_scale_out_avx, _scale_out_avx); #endif // __AVX512F__ - } +#else // __AVX__ + _scale_out0 = _mm_loadu_ps((const float*)scale_out_data); + _scale_out1 = _mm_loadu_ps((const float*)scale_out_data + 4); #endif // __AVX__ + } if (elempack == 4) { - _scale_out = _mm_loadu_ps((const float*)scale_out_data); + _scale_out0 = _mm_loadu_ps((const float*)scale_out_data); #if __AVX__ - _scale_out_avx = combine4x2_ps(_scale_out, _scale_out); + _scale_out_avx = 
combine4x2_ps(_scale_out0, _scale_out0); #if __AVX512F__ _scale_out_avx512 = combine8x2_ps(_scale_out_avx, _scale_out_avx); #endif // __AVX512F__ +#else // __AVX__ + _scale_out1 = _scale_out0; #endif // __AVX__ } } @@ -159,12 +173,12 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ #else // __AVX__ __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale_in); - _v1 = _mm_mul_ps(_v1, _scale_in); + _v0 = _mm_mul_ps(_v0, _scale_in0); + _v1 = _mm_mul_ps(_v1, _scale_in1); _v0 = activation_sse(_v0, activation_type, activation_params); _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); + _v0 = _mm_mul_ps(_v0, _scale_out0); + _v1 = _mm_mul_ps(_v1, _scale_out1); *(int64_t*)ptr = float2int8_sse(_v0, _v1); #endif // __AVX__ intptr += 8; @@ -173,9 +187,9 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ for (; i + 3 < size; i += 4) { __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale_in); + _v = _mm_mul_ps(_v, _scale_in0); _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); + _v = _mm_mul_ps(_v, _scale_out0); int32_t v = float2int8_sse(_v); ptr[0] = (v >> 0) & 0xff; ptr[1] = (v >> 8) & 0xff; @@ -198,16 +212,17 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ { float bias = bias_data[0]; #if __SSE2__ - __m128 _bias = _mm_set1_ps(bias); + __m128 _bias0 = _mm_set1_ps(bias); #if __AVX__ __m256 _bias_avx = _mm256_set1_ps(bias); #if __AVX512F__ __m512 _bias_avx512 = _mm512_set1_ps(bias); #endif // __AVX512F__ +#else // __AVX__ + __m128 _bias1 = _bias0; #endif // __AVX__ if (bias_data_size > 1) { -#if __AVX__ #if __AVX512F__ if (elempack == 16) { @@ -216,20 +231,26 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ #endif // __AVX512F__ if (elempack == 8) { +#if __AVX__ _bias_avx = _mm256_loadu_ps((const float*)bias_data); #if __AVX512F__ _bias_avx512 = combine8x2_ps(_bias_avx, _bias_avx); #endif // __AVX512F__ - } +#else // __AVX__ + _bias0 = _mm_loadu_ps((const float*)bias_data); + _bias1 = _mm_loadu_ps((const float*)bias_data + 4); #endif // __AVX__ + } if (elempack == 4) { - _bias = _mm_loadu_ps((const float*)bias_data); + _bias0 = _mm_loadu_ps((const float*)bias_data); #if __AVX__ - _bias_avx = combine4x2_ps(_bias, _bias); + _bias_avx = combine4x2_ps(_bias0, _bias0); #if __AVX512F__ _bias_avx512 = combine8x2_ps(_bias_avx, _bias_avx); #endif // __AVX512F__ +#else // __AVX__ + _bias1 = _bias0; #endif // __AVX__ } } @@ -272,12 +293,12 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ #else // __AVX__ __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_comp_fmadd_ps(_v0, _scale_in, _bias); - _v1 = _mm_comp_fmadd_ps(_v1, _scale_in, _bias); + _v0 = _mm_comp_fmadd_ps(_v0, _scale_in0, _bias0); + _v1 = _mm_comp_fmadd_ps(_v1, _scale_in1, _bias1); _v0 = activation_sse(_v0, activation_type, activation_params); _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); + _v0 = _mm_mul_ps(_v0, _scale_out0); + _v1 = _mm_mul_ps(_v1, _scale_out1); *(int64_t*)ptr = 
float2int8_sse(_v0, _v1); #endif // __AVX__ intptr += 8; @@ -286,9 +307,9 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_ for (; i + 3 < size; i += 4) { __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_comp_fmadd_ps(_v, _scale_in, _bias); + _v = _mm_comp_fmadd_ps(_v, _scale_in0, _bias0); _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); + _v = _mm_mul_ps(_v, _scale_out0); int32_t v = float2int8_sse(_v); ptr[0] = (v >> 0) & 0xff; ptr[1] = (v >> 8) & 0xff; diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 4a9d2f3739a..0398be80d4a 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -727,6 +727,7 @@ static NCNN_FORCEINLINE int64_t float2int8_avx(const __m256& _v0) __m256i _v0_i = _mm256_cvttps_epi32(_v0_adj); #if __AVX512F__ __m128i _v8 = _mm256_cvtsepi32_epi8(_v0_i); + _v8 = _mm_max_epi8(_v8, _mm_set1_epi8(-127)); #else // __AVX512F__ #if __AVX2__ __m256i _v01_s16 = _mm256_packs_epi32(_v0_i, _v0_i); @@ -1457,7 +1458,9 @@ static NCNN_FORCEINLINE __m128i float2int8_avx512(const __m512& _v0) __m512 _v0_p5 = _mm512_or_ps(_p5, _sign); __m512 _v0_adj = _mm512_add_ps(_v0, _v0_p5); __m512i _v0_i = _mm512_cvttps_epi32(_v0_adj); - return _mm512_cvtsepi32_epi8(_v0_i); + __m128i _v8 = _mm512_cvtsepi32_epi8(_v0_i); + _v8 = _mm_max_epi8(_v8, _mm_set1_epi8(-127)); + return _v8; } static NCNN_FORCEINLINE __m512 bfloat2float_avx512(const __m256i& v0) diff --git a/tests/test_quantize_oom.cpp b/tests/test_quantize_oom.cpp new file mode 100644 index 00000000000..ca78535ed97 --- /dev/null +++ b/tests/test_quantize_oom.cpp @@ -0,0 +1,80 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+
+#include "testutil.h"
+
+static int test_quantize_oom(const ncnn::Mat& a, float scale_low, float scale_high)
+{
+    ncnn::Mat scale_data;
+    if (scale_low == scale_high)
+    {
+        scale_data.create(1);
+        scale_data[0] = scale_low;
+    }
+    else
+    {
+        if (a.dims == 1) scale_data.create(a.w);
+        if (a.dims == 2) scale_data.create(a.h);
+        if (a.dims == 3) scale_data.create(a.c);
+        Randomize(scale_data, scale_low, scale_high);
+    }
+
+    ncnn::ParamDict pd;
+    pd.set(0, scale_data.w);
+
+    std::vector<ncnn::Mat> weights(1);
+    weights[0] = scale_data;
+
+    int ret = test_layer_oom("Quantize", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_quantize_oom failed a.dims=%d a=(%d %d %d) scale_low=%f scale_high=%f\n", a.dims, a.w, a.h, a.c, scale_low, scale_high);
+    }
+
+    return ret;
+}
+
+static int test_quantize_0()
+{
+    return 0
+           || test_quantize_oom(RandomMat(5, 7, 24), 100.f, 100.f)
+           || test_quantize_oom(RandomMat(7, 9, 12), 100.f, 100.f)
+           || test_quantize_oom(RandomMat(3, 5, 13), 100.f, 100.f);
+}
+
+static int test_quantize_1()
+{
+    return 0
+           || test_quantize_oom(RandomMat(15, 24), 100.f, 100.f)
+           || test_quantize_oom(RandomMat(17, 12), 100.f, 100.f)
+           || test_quantize_oom(RandomMat(19, 15), 100.f, 100.f);
+}
+
+static int test_quantize_2()
+{
+    return 0
+           || test_quantize_oom(RandomMat(128), 120.f, 140.f)
+           || test_quantize_oom(RandomMat(124), 120.f, 140.f)
+           || test_quantize_oom(RandomMat(127), 120.f, 140.f);
+}
+
+int main()
+{
+    SRAND(7767517);
+
+    return 0
+           || test_quantize_0()
+           || test_quantize_1()
+           || test_quantize_2();
+}
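
A minimal scalar sketch (illustrative only; float2int8_ref and quantize_ref are hypothetical names, not part of this patch) of the contract the rewritten SIMD paths share: multiply by the per-lane scale, round to nearest, and saturate symmetrically to [-127, 127]. This is also why the patch appends _mm_max_epi8(_v8, _mm_set1_epi8(-127)) after _mm256_cvtsepi32_epi8 / _mm512_cvtsepi32_epi8 in x86_usability.h: those instructions saturate signed 32-bit lanes to [-128, 127], one below the -127 floor of the scalar reference.

    #include <math.h>

    // round to nearest (half away from zero), then clamp to [-127, 127]
    static signed char float2int8_ref(float v)
    {
        int int32 = (int)roundf(v);
        if (int32 > 127) return 127;
        if (int32 < -127) return -127;
        return (signed char)int32;
    }

    // quantize elemcount packed elements with one scale per lane, mirroring the
    // scale_data.range(i * elempack, elempack) windows used in the forward() dispatch;
    // when scale_data_size == 1 the caller broadcasts scale_data[0] into all lanes
    static void quantize_ref(const float* ptr, signed char* s8ptr, const float* scale, int elemcount, int elempack)
    {
        for (int i = 0; i < elemcount; i++)
        {
            for (int k = 0; k < elempack; k++)
            {
                s8ptr[i * elempack + k] = float2int8_ref(ptr[i * elempack + k] * scale[k]);
            }
        }
    }

The pack4to8, pack16to8 and pack4to1 helpers are this same loop with the elempack repacking shuffle folded in, so each can be checked against this reference lane by lane.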
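
The new dims == 1 path in Quantize_x86::forward also changes the parallelization: instead of one OpenMP iteration per element, w is split into nn_w contiguous blocks of wp elements. A small standalone sketch of that arithmetic (the values in comments assume w = 127 and 4 threads; the elempack scaling applied in the real code is omitted for brevity):

    #include <algorithm>
    #include <stdio.h>

    int main()
    {
        const int w = 127;
        const int num_threads = 4;

        // same blocking as the patch: at least 1 element per block
        const int wp = std::max(1, w / num_threads); // 31
        const int nn_w = (w + wp - 1) / wp;          // 5 blocks

        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;
            const int size = std::min(w - i, wp); // last block carries the remainder (3)
            printf("block %d: [%d, %d)\n", ii, i, i + size);
        }
        return 0;
    }

Each iteration then hands quantize() a run of std::min(w - i, wp) * elempack values, so the SIMD main loops and the scalar tail inside quantize() absorb any remainder.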