From 49b27db179d971c8136eb302f708078aed3f265c Mon Sep 17 00:00:00 2001
From: nihuini
Date: Wed, 5 Feb 2025 16:12:09 +0800
Subject: [PATCH] arm: add packed int8 output paths (pack4to8, pack4to1) to
 Quantize

---
 src/layer/arm/quantize_arm.cpp         | 524 +++++++++++++++++++++++--
 src/layer/arm/quantize_arm_asimdhp.cpp | 407 +++++++++++++++++--
 2 files changed, 867 insertions(+), 64 deletions(-)

diff --git a/src/layer/arm/quantize_arm.cpp b/src/layer/arm/quantize_arm.cpp
index 46b44d104c3..18a07f0a324 100644
--- a/src/layer/arm/quantize_arm.cpp
+++ b/src/layer/arm/quantize_arm.cpp
@@ -60,6 +60,20 @@ static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data
 
     int i = 0;
 #if __ARM_NEON
+    for (; i + 15 < size; i += 16)
+    {
+        float32x4_t _v0 = vld1q_f32(ptr);
+        float32x4_t _v1 = vld1q_f32(ptr + 4);
+        float32x4_t _v2 = vld1q_f32(ptr + 8);
+        float32x4_t _v3 = vld1q_f32(ptr + 12);
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        _v2 = vmulq_f32(_v2, _scale);
+        _v3 = vmulq_f32(_v3, _scale);
+        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
+        ptr += 16;
+        s8ptr += 16;
+    }
     for (; i + 7 < size; i += 8)
     {
         float32x4_t _v0 = vld1q_f32(ptr);
@@ -92,6 +106,119 @@ static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data
     }
 }
 
+#if __ARM_NEON
+static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
+{
+    const int scale_data_size = scale_data.w;
+
+    // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount);
+
+    float scale = scale_data[0];
+    float32x4_t _scale0 = vdupq_n_f32(scale);
+    float32x4_t _scale1 = _scale0;
+    if (scale_data_size > 1)
+    {
+        _scale0 = vld1q_f32((const float*)scale_data);
+        _scale1 = vld1q_f32((const float*)scale_data + 4);
+    }
+
+    int i = 0;
+    for (; i + 1 < elemcount; i += 2)
+    {
+        float32x4_t _v0 = vld1q_f32(ptr0);
+        float32x4_t _v1 = vld1q_f32(ptr1);
+        float32x4_t _v2 = vld1q_f32(ptr0 + 4);
+        float32x4_t _v3 = vld1q_f32(ptr1 + 4);
+        _v0 = vmulq_f32(_v0, _scale0);
+        _v1 = vmulq_f32(_v1, _scale1);
+        _v2 = vmulq_f32(_v2, _scale0);
+        _v3 = vmulq_f32(_v3, _scale1);
+        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
+        ptr0 += 8;
+        ptr1 += 8;
+        s8ptr += 16;
+    }
+    for (; i < elemcount; i++)
+    {
+        float32x4_t _v0 = vld1q_f32(ptr0);
+        float32x4_t _v1 = vld1q_f32(ptr1);
+        _v0 = vmulq_f32(_v0, _scale0);
+        _v1 = vmulq_f32(_v1, _scale1);
+        vst1_s8(s8ptr, float2int8(_v0, _v1));
+        ptr0 += 4;
+        ptr1 += 4;
+        s8ptr += 8;
+    }
+}
+
+static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
+{
+    const int scale_data_size = scale_data.w;
+
+    // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount);
+
+    float scale = scale_data[0];
+    float32x4_t _scale = vdupq_n_f32(scale);
+    if (scale_data_size > 1)
+    {
+        _scale = vld1q_f32((const float*)scale_data);
+    }
+
+    int i = 0;
+    for (; i + 7 < elemcount; i += 8)
+    {
+        float32x4_t _v0 = vld1q_f32(ptr);
+        float32x4_t _v1 = vld1q_f32(ptr + 4);
+        float32x4_t _v2 = vld1q_f32(ptr + 8);
+        float32x4_t _v3 = vld1q_f32(ptr + 12);
+        float32x4_t _v4 = vld1q_f32(ptr + 16);
+        float32x4_t _v5 = vld1q_f32(ptr + 20);
+        float32x4_t _v6 = vld1q_f32(ptr + 24);
+        float32x4_t _v7 = vld1q_f32(ptr + 28);
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        _v2 = vmulq_f32(_v2, _scale);
+        _v3 = vmulq_f32(_v3, _scale);
+        _v4 = vmulq_f32(_v4, _scale);
+        _v5 = vmulq_f32(_v5, _scale);
+        _v6 = vmulq_f32(_v6, _scale);
+        _v7 = vmulq_f32(_v7, _scale);
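+        // 8 elements x 4 channels sit interleaved in _v0.._v7 as
+        // [e0c0 e0c1 e0c2 e0c3][e1c0 ...]; after narrowing to int8, two
+        // vuzpq_s8 passes de-interleave the 32 bytes (even/odd channels
+        // first, then again), leaving one contiguous 8-byte run per channel
+        // for the four per-channel row stores below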
+        int8x8_t v0 = float2int8(_v0, _v1);
+        int8x8_t v1 = float2int8(_v2, _v3);
+        int8x8_t v2 = float2int8(_v4, _v5);
+        int8x8_t v3 = float2int8(_v6, _v7);
+        int8x16_t v01 = vcombine_s8(v0, v1);
+        int8x16_t v23 = vcombine_s8(v2, v3);
+        int8x16x2_t v0213 = vuzpq_s8(v01, v23);
+        int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]);
+        vst1_s8(s8ptr0, vget_low_s8(v0123.val[0]));
+        vst1_s8(s8ptr1, vget_high_s8(v0123.val[0]));
+        vst1_s8(s8ptr2, vget_low_s8(v0123.val[1]));
+        vst1_s8(s8ptr3, vget_high_s8(v0123.val[1]));
+        ptr += 32;
+        s8ptr0 += 8;
+        s8ptr1 += 8;
+        s8ptr2 += 8;
+        s8ptr3 += 8;
+    }
+    for (; i < elemcount; i++)
+    {
+        float32x4_t _v = vld1q_f32(ptr);
+        _v = vmulq_f32(_v, _scale);
+        int8x8_t v = float2int8(_v, _v);
+        s8ptr0[0] = vget_lane_s8(v, 0);
+        s8ptr1[0] = vget_lane_s8(v, 1);
+        s8ptr2[0] = vget_lane_s8(v, 2);
+        s8ptr3[0] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr0 += 1;
+        s8ptr1 += 1;
+        s8ptr2 += 1;
+        s8ptr3 += 1;
+    }
+}
+#endif // __ARM_NEON
+
 int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int elembits = bottom_blob.elembits();
@@ -116,11 +243,20 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __ARM_NEON
+        if (opt.use_packing_layout)
+        {
+            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outw = w * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -145,37 +281,127 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
 
     if (dims == 2)
     {
-        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __ARM_NEON
+        if (opt.use_packing_layout)
+        {
+            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outh = h * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+#if __ARM_NEON
+        if (elempack == 4 && out_elempack == 8)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < outh; i++)
+            {
+                const float* ptr0 = bottom_blob.row(i * 2);
+                const float* ptr1 = bottom_blob.row(i * 2 + 1);
+                signed char* s8ptr = top_blob.row<signed char>(i);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;
+
+                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w);
+            }
+        }
+        if (elempack == 4 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const float* ptr = bottom_blob.row(i);
+                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
+                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
+                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
+                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
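+                // pack4 row i fans out to int8 rows 4*i .. 4*i+3, one per
+                // channel; the slice keeps the four matching per-channel scales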
+
+                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
+            }
+        }
+#endif // __ARM_NEON
+        if (elempack == out_elempack)
         {
-            const float* ptr = bottom_blob.row(i);
-            signed char* s8ptr = top_blob.row<signed char>(i);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const float* ptr = bottom_blob.row(i);
+                signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            quantize(ptr, s8ptr, scale_data_i, w, elempack);
+                quantize(ptr, s8ptr, scale_data_i, w, elempack);
+            }
         }
     }
 
     if (dims == 3)
     {
-        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __ARM_NEON
+        if (opt.use_packing_layout)
+        {
+            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outc = channels * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
+#if __ARM_NEON
+        if (elempack == 4 && out_elempack == 8)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < outc; q++)
+            {
+                const float* ptr0 = bottom_blob.channel(q * 2);
+                const float* ptr1 = bottom_blob.channel(q * 2 + 1);
+                signed char* s8ptr = top_blob.channel(q);
+
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;
+
+                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h);
+            }
+        }
+        if (elempack == 4 && out_elempack == 1)
         {
-            const float* ptr = bottom_blob.channel(q);
-            signed char* s8ptr = top_blob.channel(q);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                signed char* s8ptr0 = top_blob.channel(q * 4);
+                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
+                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
+                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);
+
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+
+                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
+            }
+        }
+#endif // __ARM_NEON
+        if (elempack == out_elempack)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                signed char* s8ptr = top_blob.channel(q);
 
-            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
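+                // same-pack fallback: each output channel q consumes elempack
+                // scales starting at q * elempack when per-channel scales exist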
 
-            quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
+                quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
+            }
         }
     }
 
@@ -204,6 +430,22 @@ static void quantize_bf16s(const unsigned short* ptr, signed char* s8ptr, const
 
     int i = 0;
 #if __ARM_NEON
+    for (; i + 15 < size; i += 16)
+    {
+        uint16x8_t _v01 = vld1q_u16(ptr);
+        uint16x8_t _v23 = vld1q_u16(ptr + 8);
+        float32x4_t _v0 = bfloat2float(vget_low_u16(_v01));
+        float32x4_t _v1 = bfloat2float(vget_high_u16(_v01));
+        float32x4_t _v2 = bfloat2float(vget_low_u16(_v23));
+        float32x4_t _v3 = bfloat2float(vget_high_u16(_v23));
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        _v2 = vmulq_f32(_v2, _scale);
+        _v3 = vmulq_f32(_v3, _scale);
+        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
+        ptr += 16;
+        s8ptr += 16;
+    }
     for (; i + 7 < size; i += 8)
     {
         uint16x8_t _v01 = vld1q_u16(ptr);
@@ -237,6 +479,125 @@ static void quantize_bf16s(const unsigned short* ptr, signed char* s8ptr, const
     }
 }
 
+#if __ARM_NEON
+static void quantize_pack4to8_bf16s(const unsigned short* ptr0, const unsigned short* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
+{
+    const int scale_data_size = scale_data.w;
+
+    // NCNN_LOGE("quantize_pack4to8_bf16s %d %d", scale_data_size, elemcount);
+
+    float scale = scale_data[0];
+    float32x4_t _scale0 = vdupq_n_f32(scale);
+    float32x4_t _scale1 = _scale0;
+    if (scale_data_size > 1)
+    {
+        _scale0 = vld1q_f32((const float*)scale_data);
+        _scale1 = vld1q_f32((const float*)scale_data + 4);
+    }
+
+    int i = 0;
+    for (; i + 1 < elemcount; i += 2)
+    {
+        uint16x8_t _v02 = vld1q_u16(ptr0);
+        uint16x8_t _v13 = vld1q_u16(ptr1);
+        float32x4_t _v0 = bfloat2float(vget_low_u16(_v02));
+        float32x4_t _v1 = bfloat2float(vget_low_u16(_v13));
+        float32x4_t _v2 = bfloat2float(vget_high_u16(_v02));
+        float32x4_t _v3 = bfloat2float(vget_high_u16(_v13));
+        _v0 = vmulq_f32(_v0, _scale0);
+        _v1 = vmulq_f32(_v1, _scale1);
+        _v2 = vmulq_f32(_v2, _scale0);
+        _v3 = vmulq_f32(_v3, _scale1);
+        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
+        ptr0 += 8;
+        ptr1 += 8;
+        s8ptr += 16;
+    }
+    for (; i < elemcount; i++)
+    {
+        float32x4_t _v0 = bfloat2float(vld1_u16(ptr0));
+        float32x4_t _v1 = bfloat2float(vld1_u16(ptr1));
+        _v0 = vmulq_f32(_v0, _scale0);
+        _v1 = vmulq_f32(_v1, _scale1);
+        vst1_s8(s8ptr, float2int8(_v0, _v1));
+        ptr0 += 4;
+        ptr1 += 4;
+        s8ptr += 8;
+    }
+}
+
+static void quantize_pack4to1_bf16s(const unsigned short* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
+{
+    const int scale_data_size = scale_data.w;
+
+    // NCNN_LOGE("quantize_pack4to1_bf16s %d %d", scale_data_size, elemcount);
+
+    float scale = scale_data[0];
+    float32x4_t _scale = vdupq_n_f32(scale);
+    if (scale_data_size > 1)
+    {
+        _scale = vld1q_f32((const float*)scale_data);
+    }
+
+    int i = 0;
+    for (; i + 7 < elemcount; i += 8)
+    {
+        uint16x8_t _v01 = vld1q_u16(ptr);
+        uint16x8_t _v23 = vld1q_u16(ptr + 8);
+        uint16x8_t _v45 = vld1q_u16(ptr + 16);
+        uint16x8_t _v67 = vld1q_u16(ptr + 24);
+        float32x4_t _v0 = bfloat2float(vget_low_u16(_v01));
+        float32x4_t _v1 = bfloat2float(vget_high_u16(_v01));
+        float32x4_t _v2 = bfloat2float(vget_low_u16(_v23));
+        float32x4_t _v3 = bfloat2float(vget_high_u16(_v23));
+        float32x4_t _v4 = bfloat2float(vget_low_u16(_v45));
+        float32x4_t _v5 = bfloat2float(vget_high_u16(_v45));
+        float32x4_t _v6 = bfloat2float(vget_low_u16(_v67));
+        float32x4_t _v7 = bfloat2float(vget_high_u16(_v67));
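+        // bf16 lanes are widened to fp32 before scaling; the int8 narrowing
+        // and two-pass vuzpq_s8 channel split below mirror the fp32 path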
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        _v2 = vmulq_f32(_v2, _scale);
+        _v3 = vmulq_f32(_v3, _scale);
+        _v4 = vmulq_f32(_v4, _scale);
+        _v5 = vmulq_f32(_v5, _scale);
+        _v6 = vmulq_f32(_v6, _scale);
+        _v7 = vmulq_f32(_v7, _scale);
+        int8x8_t v0 = float2int8(_v0, _v1);
+        int8x8_t v1 = float2int8(_v2, _v3);
+        int8x8_t v2 = float2int8(_v4, _v5);
+        int8x8_t v3 = float2int8(_v6, _v7);
+        int8x16_t v01 = vcombine_s8(v0, v1);
+        int8x16_t v23 = vcombine_s8(v2, v3);
+        int8x16x2_t v0213 = vuzpq_s8(v01, v23);
+        int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]);
+        vst1_s8(s8ptr0, vget_low_s8(v0123.val[0]));
+        vst1_s8(s8ptr1, vget_high_s8(v0123.val[0]));
+        vst1_s8(s8ptr2, vget_low_s8(v0123.val[1]));
+        vst1_s8(s8ptr3, vget_high_s8(v0123.val[1]));
+        ptr += 32;
+        s8ptr0 += 8;
+        s8ptr1 += 8;
+        s8ptr2 += 8;
+        s8ptr3 += 8;
+    }
+    for (; i < elemcount; i++)
+    {
+        float32x4_t _v = bfloat2float(vld1_u16(ptr));
+        _v = vmulq_f32(_v, _scale);
+        int8x8_t v = float2int8(_v, _v);
+        s8ptr0[0] = vget_lane_s8(v, 0);
+        s8ptr1[0] = vget_lane_s8(v, 1);
+        s8ptr2[0] = vget_lane_s8(v, 2);
+        s8ptr3[0] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr0 += 1;
+        s8ptr1 += 1;
+        s8ptr2 += 1;
+        s8ptr3 += 1;
+    }
+}
+#endif // __ARM_NEON
+
 int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     const int dims = bottom_blob.dims;
@@ -244,11 +605,20 @@ int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opt
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __ARM_NEON
+        if (opt.use_packing_layout)
+        {
+            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outw = w * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -273,37 +643,127 @@ int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opt
 
     if (dims == 2)
     {
-        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __ARM_NEON
+        if (opt.use_packing_layout)
+        {
+            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outh = h * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
            return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+#if __ARM_NEON
+        if (elempack == 4 && out_elempack == 8)
         {
-            const unsigned short* ptr = bottom_blob.row<const unsigned short>(i);
-            signed char* s8ptr = top_blob.row<signed char>(i);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < outh; i++)
+            {
+                const unsigned short* ptr0 = bottom_blob.row<const unsigned short>(i * 2);
+                const unsigned short* ptr1 = bottom_blob.row<const unsigned short>(i * 2 + 1);
+                signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;
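+                // rows i*2 and i*2+1 (pack4) merge into one pack8 int8 row,
+                // so the scale slice spans out_elempack (8) channels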
 
-            quantize_bf16s(ptr, s8ptr, scale_data_i, w, elempack);
+                quantize_pack4to8_bf16s(ptr0, ptr1, s8ptr, scale_data_i, w);
+            }
+        }
+        if (elempack == 4 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const unsigned short* ptr = bottom_blob.row<const unsigned short>(i);
+                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
+                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
+                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
+                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+
+                quantize_pack4to1_bf16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
+            }
+        }
+#endif // __ARM_NEON
+        if (elempack == out_elempack)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const unsigned short* ptr = bottom_blob.row<const unsigned short>(i);
+                signed char* s8ptr = top_blob.row<signed char>(i);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+
+                quantize_bf16s(ptr, s8ptr, scale_data_i, w, elempack);
+            }
         }
     }
 
     if (dims == 3)
     {
-        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __ARM_NEON
+        if (opt.use_packing_layout)
+        {
+            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
+        }
+#endif
+        const int outc = channels * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
+#if __ARM_NEON
+        if (elempack == 4 && out_elempack == 8)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < outc; q++)
+            {
+                const unsigned short* ptr0 = bottom_blob.channel(q * 2);
+                const unsigned short* ptr1 = bottom_blob.channel(q * 2 + 1);
+                signed char* s8ptr = top_blob.channel(q);
+
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;
+
+                quantize_pack4to8_bf16s(ptr0, ptr1, s8ptr, scale_data_q, w * h);
+            }
+        }
+        if (elempack == 4 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const unsigned short* ptr = bottom_blob.channel(q);
+                signed char* s8ptr0 = top_blob.channel(q * 4);
+                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
+                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
+                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);
+
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+
+                quantize_pack4to1_bf16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
+            }
+        }
+#endif // __ARM_NEON
+        if (elempack == out_elempack)
         {
-            const unsigned short* ptr = bottom_blob.channel(q);
-            signed char* s8ptr = top_blob.channel(q);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const unsigned short* ptr = bottom_blob.channel(q);
+                signed char* s8ptr = top_blob.channel(q);
 
-            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            quantize_bf16s(ptr, s8ptr, scale_data_q, w * h, elempack);
+                quantize_bf16s(ptr, s8ptr, scale_data_q, w * h, elempack);
+            }
         }
     }
 
diff --git a/src/layer/arm/quantize_arm_asimdhp.cpp b/src/layer/arm/quantize_arm_asimdhp.cpp
index ce46fafabc3..3c3bcb836ec 100644
--- a/src/layer/arm/quantize_arm_asimdhp.cpp
+++ b/src/layer/arm/quantize_arm_asimdhp.cpp
@@ -41,6 +41,22 @@ static void quantize_fp16s(const __fp16* ptr, signed char* s8ptr, const Mat& sca
     }
 
     int i = 0;
+    for (; i + 15 < size; i += 16)
+    {
+        float16x8_t _v01 = vld1q_f16(ptr);
+        float16x8_t _v23 = vld1q_f16(ptr + 8);
+        float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01));
+        float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01));
+        float32x4_t _v2 = vcvt_f32_f16(vget_low_f16(_v23));
+        float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v23));
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        _v2 = vmulq_f32(_v2, _scale);
+        _v3 = vmulq_f32(_v3, _scale);
+        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
+        ptr += 16;
+        s8ptr += 16;
+    }
     for (; i + 7 < size; i += 8)
     {
         float16x8_t _v01 = vld1q_f16(ptr);
@@ -73,6 +89,123 @@ static void quantize_fp16s(const __fp16* ptr, signed char* s8ptr, const Mat& sca
     }
 }
 
+static void quantize_pack4to8_fp16s(const __fp16* ptr0, const __fp16* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
+{
+    const int scale_data_size = scale_data.w;
+
+    // NCNN_LOGE("quantize_pack4to8_fp16s %d %d", scale_data_size, elemcount);
+
+    float scale = scale_data[0];
+    float32x4_t _scale0 = vdupq_n_f32(scale);
+    float32x4_t _scale1 = _scale0;
+    if (scale_data_size > 1)
+    {
+        _scale0 = vld1q_f32((const float*)scale_data);
+        _scale1 = vld1q_f32((const float*)scale_data + 4);
+    }
+
+    int i = 0;
+    for (; i + 1 < elemcount; i += 2)
+    {
+        float16x8_t _v02 = vld1q_f16(ptr0);
+        float16x8_t _v13 = vld1q_f16(ptr1);
+        float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v02));
+        float32x4_t _v1 = vcvt_f32_f16(vget_low_f16(_v13));
+        float32x4_t _v2 = vcvt_f32_f16(vget_high_f16(_v02));
+        float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v13));
+        _v0 = vmulq_f32(_v0, _scale0);
+        _v1 = vmulq_f32(_v1, _scale1);
+        _v2 = vmulq_f32(_v2, _scale0);
+        _v3 = vmulq_f32(_v3, _scale1);
+        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
+        ptr0 += 8;
+        ptr1 += 8;
+        s8ptr += 16;
+    }
+    for (; i < elemcount; i++)
+    {
+        float32x4_t _v0 = vcvt_f32_f16(vld1_f16(ptr0));
+        float32x4_t _v1 = vcvt_f32_f16(vld1_f16(ptr1));
+        _v0 = vmulq_f32(_v0, _scale0);
+        _v1 = vmulq_f32(_v1, _scale1);
+        vst1_s8(s8ptr, float2int8(_v0, _v1));
+        ptr0 += 4;
+        ptr1 += 4;
+        s8ptr += 8;
+    }
+}
+
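+// quantize_pack4to1_fp16s widens __fp16 values to fp32 and scales in fp32
+// (scale_data is stored as fp32 here); the fp16sa variant further down
+// multiplies directly in fp16 registers instead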
+static void quantize_pack4to1_fp16s(const __fp16* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
+{
+    const int scale_data_size = scale_data.w;
+
+    // NCNN_LOGE("quantize_pack4to1_fp16s %d %d", scale_data_size, elemcount);
+
+    float scale = scale_data[0];
+    float32x4_t _scale = vdupq_n_f32(scale);
+    if (scale_data_size > 1)
+    {
+        _scale = vld1q_f32((const float*)scale_data);
+    }
+
+    int i = 0;
+    for (; i + 7 < elemcount; i += 8)
+    {
+        float16x8_t _v01 = vld1q_f16(ptr);
+        float16x8_t _v23 = vld1q_f16(ptr + 8);
+        float16x8_t _v45 = vld1q_f16(ptr + 16);
+        float16x8_t _v67 = vld1q_f16(ptr + 24);
+        float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01));
+        float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01));
+        float32x4_t _v2 = vcvt_f32_f16(vget_low_f16(_v23));
+        float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v23));
+        float32x4_t _v4 = vcvt_f32_f16(vget_low_f16(_v45));
+        float32x4_t _v5 = vcvt_f32_f16(vget_high_f16(_v45));
+        float32x4_t _v6 = vcvt_f32_f16(vget_low_f16(_v67));
+        float32x4_t _v7 = vcvt_f32_f16(vget_high_f16(_v67));
+        _v0 = vmulq_f32(_v0, _scale);
+        _v1 = vmulq_f32(_v1, _scale);
+        _v2 = vmulq_f32(_v2, _scale);
+        _v3 = vmulq_f32(_v3, _scale);
+        _v4 = vmulq_f32(_v4, _scale);
+        _v5 = vmulq_f32(_v5, _scale);
+        _v6 = vmulq_f32(_v6, _scale);
+        _v7 = vmulq_f32(_v7, _scale);
+        int8x8_t v0 = float2int8(_v0, _v1);
+        int8x8_t v1 = float2int8(_v2, _v3);
+        int8x8_t v2 = float2int8(_v4, _v5);
+        int8x8_t v3 = float2int8(_v6, _v7);
+        int8x16_t v01 = vcombine_s8(v0, v1);
+        int8x16_t v23 = vcombine_s8(v2, v3);
+        int8x16x2_t v0213 = vuzpq_s8(v01, v23);
+        int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]);
+        vst1_s8(s8ptr0, vget_low_s8(v0123.val[0]));
+        vst1_s8(s8ptr1, vget_high_s8(v0123.val[0]));
+        vst1_s8(s8ptr2, vget_low_s8(v0123.val[1]));
+        vst1_s8(s8ptr3, vget_high_s8(v0123.val[1]));
+        ptr += 32;
+        s8ptr0 += 8;
+        s8ptr1 += 8;
+        s8ptr2 += 8;
+        s8ptr3 += 8;
+    }
+    for (; i < elemcount; i++)
+    {
+        float32x4_t _v = vcvt_f32_f16(vld1_f16(ptr));
+        _v = vmulq_f32(_v, _scale);
+        int8x8_t v = float2int8(_v, _v);
+        s8ptr0[0] = vget_lane_s8(v, 0);
+        s8ptr1[0] = vget_lane_s8(v, 1);
+        s8ptr2[0] = vget_lane_s8(v, 2);
+        s8ptr3[0] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr0 += 1;
+        s8ptr1 += 1;
+        s8ptr2 += 1;
+        s8ptr3 += 1;
+    }
+}
+
 int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     const int dims = bottom_blob.dims;
@@ -80,11 +213,18 @@ int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Opt
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+        if (opt.use_packing_layout)
+        {
+            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
+        }
+        const int outw = w * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -109,37 +249,119 @@ int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Opt
 
     if (dims == 2)
     {
-        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+        if (opt.use_packing_layout)
+        {
+            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
+        }
+        const int outh = h * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
            return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+        if (elempack == 4 && out_elempack == 8)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < outh; i++)
+            {
+                const __fp16* ptr0 = bottom_blob.row<const __fp16>(i * 2);
+                const __fp16* ptr1 = bottom_blob.row<const __fp16>(i * 2 + 1);
+                signed char* s8ptr = top_blob.row<signed char>(i);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;
+
+                quantize_pack4to8_fp16s(ptr0, ptr1, s8ptr, scale_data_i, w);
+            }
+        }
+        if (elempack == 4 && out_elempack == 1)
         {
-            const __fp16* ptr = bottom_blob.row<const __fp16>(i);
-            signed char* s8ptr = top_blob.row<signed char>(i);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(i);
+                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
+                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
+                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
+                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+
+                quantize_pack4to1_fp16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
+            }
+        }
+        if (elempack == out_elempack)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(i);
+                signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            quantize_fp16s(ptr, s8ptr, scale_data_i, w, elempack);
+                quantize_fp16s(ptr, s8ptr, scale_data_i, w, elempack);
+            }
         }
     }
 
     if (dims == 3)
     {
-        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+        if (opt.use_packing_layout)
+        {
+            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
+        }
+        const int outc = channels * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
            return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
+        if (elempack == 4 && out_elempack == 8)
         {
-            const __fp16* ptr = bottom_blob.channel(q);
-            signed char* s8ptr = top_blob.channel(q);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < outc; q++)
+            {
+                const __fp16* ptr0 = bottom_blob.channel(q * 2);
+                const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1);
+                signed char* s8ptr = top_blob.channel(q);
 
-            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;
 
-            quantize_fp16s(ptr, s8ptr, scale_data_q, w * h, elempack);
+                quantize_pack4to8_fp16s(ptr0, ptr1, s8ptr, scale_data_q, w * h);
+            }
+        }
+        if (elempack == 4 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_blob.channel(q);
+                signed char* s8ptr0 = top_blob.channel(q * 4);
+                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
+                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
+                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);
+
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+
+                quantize_pack4to1_fp16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
+            }
+        }
+        if (elempack == out_elempack)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_blob.channel(q);
+                signed char* s8ptr = top_blob.channel(q);
 
-            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+
+                quantize_fp16s(ptr, s8ptr, scale_data_q, w * h, elempack);
+            }
         }
     }
 
@@ -201,6 +423,66 @@
     }
 }
 
+static void quantize_pack4to1_fp16sa(const __fp16* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
+{
+    const int scale_data_size = scale_data.w;
+
+    // NCNN_LOGE("quantize_pack4to1_fp16sa %d %d", scale_data_size, elemcount);
+
+    __fp16 scale = (__fp16)scale_data[0];
+    float16x4_t _scale = vdup_n_f16(scale);
+    if (scale_data_size > 1)
+    {
+        _scale = vcvt_f16_f32(vld1q_f32((const float*)scale_data));
+    }
+    float16x8_t _scale01 = vcombine_f16(_scale, _scale);
+
+    int i = 0;
+    for (; i + 7 < elemcount; i += 8)
+    {
+        float16x8_t _v01 = vld1q_f16(ptr);
+        float16x8_t _v23 = vld1q_f16(ptr + 8);
+        float16x8_t _v45 = vld1q_f16(ptr + 16);
+        float16x8_t _v67 = vld1q_f16(ptr + 24);
+        _v01 = vmulq_f16(_v01, _scale01);
+        _v23 = vmulq_f16(_v23, _scale01);
+        _v45 = vmulq_f16(_v45, _scale01);
+        _v67 = vmulq_f16(_v67, _scale01);
+        int8x8_t v0 = float2int8(_v01);
+        int8x8_t v1 = float2int8(_v23);
+        int8x8_t v2 = float2int8(_v45);
+        int8x8_t v3 = float2int8(_v67);
+        int8x16_t v01 = vcombine_s8(v0, v1);
+        int8x16_t v23 = vcombine_s8(v2, v3);
+        int8x16x2_t v0213 = vuzpq_s8(v01, v23);
+        int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]);
+        vst1_s8(s8ptr0, vget_low_s8(v0123.val[0]));
+        vst1_s8(s8ptr1, vget_high_s8(v0123.val[0]));
+        vst1_s8(s8ptr2, vget_low_s8(v0123.val[1]));
+        vst1_s8(s8ptr3, vget_high_s8(v0123.val[1]));
+        ptr += 32;
+        s8ptr0 += 8;
+        s8ptr1 += 8;
+        s8ptr2 += 8;
+        s8ptr3 += 8;
+    }
+    for (; i < elemcount; i++)
+    {
+        float16x4_t _v = vld1_f16(ptr);
+        _v = vmul_f16(_v, _scale);
+        int8x8_t v = float2int8(vcombine_f16(_v, _v));
+        s8ptr0[0] = vget_lane_s8(v, 0);
+        s8ptr1[0] = vget_lane_s8(v, 1);
+        s8ptr2[0] = vget_lane_s8(v, 2);
+        s8ptr3[0] = vget_lane_s8(v, 3);
+        ptr += 4;
+        s8ptr0 += 1;
+        s8ptr1 += 1;
+        s8ptr2 += 1;
+        s8ptr3 += 1;
+    }
+}
+
 int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     const int dims = bottom_blob.dims;
@@ -208,11 +490,18 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+        if (opt.use_packing_layout)
+        {
+            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
+        }
+        const int outw = w * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
            return -100;
 
@@ -237,37 +526,91 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
 
     if (dims == 2)
     {
-        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+        if (opt.use_packing_layout)
+        {
+            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
+        }
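+        // the h * elempack scalar rows regroup into packs of 8 when the
+        // total divides evenly; otherwise the int8 output stays unpacked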
+        const int outh = h * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
            return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+        if (elempack == 4 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(i);
+                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
+                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
+                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
+                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+
+                quantize_pack4to1_fp16sa(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
+            }
+        }
+        if (elempack == out_elempack)
         {
-            const __fp16* ptr = bottom_blob.row<const __fp16>(i);
-            signed char* s8ptr = top_blob.row<signed char>(i);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(i);
+                signed char* s8ptr = top_blob.row<signed char>(i);
 
-            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
 
-            quantize_fp16sa(ptr, s8ptr, scale_data_i, w, elempack);
+                quantize_fp16sa(ptr, s8ptr, scale_data_i, w, elempack);
+            }
         }
     }
 
     if (dims == 3)
     {
-        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+        if (opt.use_packing_layout)
+        {
+            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
+        }
+        const int outc = channels * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
            return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
+        if (elempack == 4 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_blob.channel(q);
+                signed char* s8ptr0 = top_blob.channel(q * 4);
+                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
+                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
+                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);
+
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+
+                quantize_pack4to1_fp16sa(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
+            }
+        }
+        if (elempack == out_elempack)
        {
-            const __fp16* ptr = bottom_blob.channel(q);
-            signed char* s8ptr = top_blob.channel(q);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_blob.channel(q);
+                signed char* s8ptr = top_blob.channel(q);
 
-            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
 
-            quantize_fp16sa(ptr, s8ptr, scale_data_q, w * h, elempack);
+                quantize_fp16sa(ptr, s8ptr, scale_data_q, w * h, elempack);
+            }
         }
     }