diff --git a/src/amalgam/gen/avx.c b/src/amalgam/gen/avx.c index 94abab20153..5dac1f94185 100644 --- a/src/amalgam/gen/avx.c +++ b/src/amalgam/gen/avx.c @@ -6402,8 +6402,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -6430,7 +6430,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -6554,8 +6554,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -6603,7 +6603,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -6744,8 +6744,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -6766,7 +6766,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -6864,8 +6864,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -6893,7 +6893,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -8832,8 +8832,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -8854,7 +8854,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -8945,8 +8945,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t 
k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -8974,7 +8974,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -10945,8 +10945,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -10967,7 +10967,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -11059,8 +11059,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -11088,7 +11088,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -13706,10 +13706,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -13728,7 +13728,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -13818,10 +13818,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -13847,7 +13847,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/amalgam/gen/avx2.c b/src/amalgam/gen/avx2.c index 34d4eb45cbb..911886dae96 100644 --- a/src/amalgam/gen/avx2.c +++ b/src/amalgam/gen/avx2.c @@ -3233,8 +3233,8 @@ void 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point0); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -3289,7 +3289,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -3406,8 +3406,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx2( __m256i vacc1x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point1); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -3469,7 +3469,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx2( vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -3596,8 +3596,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point0); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -3616,7 +3616,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -3742,8 +3742,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2( __m256i vacc2x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point2); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -3776,7 +3776,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -5229,8 +5229,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -5249,7 +5249,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * 
sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -5376,8 +5376,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2( __m256i vacc2x67 = vacc0x67; w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -5410,7 +5410,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -6967,8 +6967,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -6987,7 +6987,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -7115,8 +7115,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2( __m256i vacc2x67 = vacc0x67; w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -7149,7 +7149,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -8943,6 +8943,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2( const uint8_t* a0 = a; uint8_t* c0 = c; + const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point); do { const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]); const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]); @@ -8958,9 +8959,8 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1); w = (const int32_t*) w + 8; - size_t k = 0; - const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -8979,7 +8979,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const uint8_t*) w + 64; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); @@ -9083,6 +9083,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2( c2 = c1; } + const __m256i vb_zero_point = _mm256_load_si256((const 
__m256i*) params->fp32_avx2.kernel_zero_point); do { const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]); const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]); @@ -9106,9 +9107,8 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2( __m256i vacc2x67 = vacc0x67; w = (const int32_t*) w + 8; - size_t k = 0; - const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -9141,7 +9141,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const uint8_t*) w + 64; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/amalgam/gen/sse2.c b/src/amalgam/gen/sse2.c index 9ca188557c1..f549d483ca2 100644 --- a/src/amalgam/gen/sse2.c +++ b/src/amalgam/gen/sse2.c @@ -4070,8 +4070,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld128( __m128i vacc0x3 = _mm_unpackhi_epi64(vksum230, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -4098,7 +4098,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -4261,8 +4261,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld128( __m128i vacc3x3 = _mm_unpackhi_epi64(vksum233, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -4310,7 +4310,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -4470,8 +4470,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld64( __m128i vacc0x3 = _mm_unpackhi_epi64(vksum230, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -4494,7 +4494,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -4636,8 +4636,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld64( __m128i vacc2x3 = _mm_unpackhi_epi64(vksum232, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + 
while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -4674,7 +4674,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -7043,8 +7043,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -7067,7 +7067,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -7169,8 +7169,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -7207,7 +7207,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -9050,8 +9050,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -9074,7 +9074,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -9177,8 +9177,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -9215,7 +9215,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -12861,10 +12861,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - 
size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero); a0 += 8; @@ -12887,7 +12887,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); @@ -12987,10 +12987,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero); a0 += 8; @@ -13027,7 +13027,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/amalgam/gen/sse41.c b/src/amalgam/gen/sse41.c index 6898209818b..02cf8d1c636 100644 --- a/src/amalgam/gen/sse41.c +++ b/src/amalgam/gen/sse41.c @@ -2379,8 +2379,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -2407,7 +2407,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -2521,8 +2521,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld128( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -2563,7 +2563,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -2688,8 +2688,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -2712,7 +2712,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * 
sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -2824,8 +2824,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld64( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -2862,7 +2862,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -4990,8 +4990,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -5014,7 +5014,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -5115,8 +5115,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -5153,7 +5153,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -6915,8 +6915,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -6939,7 +6939,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -7041,8 +7041,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -7079,7 +7079,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -9859,9 +9859,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const 
__m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -9884,7 +9884,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -9984,9 +9984,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -10023,7 +10023,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/amalgam/gen/xop.c b/src/amalgam/gen/xop.c index 1d78a0ea9d4..4a4d24ca129 100644 --- a/src/amalgam/gen/xop.c +++ b/src/amalgam/gen/xop.c @@ -61,8 +61,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -85,7 +85,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -212,8 +212,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -257,7 +257,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128( vacc3x3 = _mm_maddd_epi16(vxa3, vxb3, vacc3x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -398,8 +398,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -422,7 +422,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -520,8 +520,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const 
__m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -551,7 +551,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -2306,8 +2306,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -2330,7 +2330,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -2421,8 +2421,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -2452,7 +2452,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -4431,8 +4431,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -4455,7 +4455,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -4547,8 +4547,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -4578,7 +4578,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -6171,9 +6171,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -6196,7 +6196,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i 
vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); @@ -6286,9 +6286,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -6318,7 +6318,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld128.c index 49203cdfad0..c14786b5bf1 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld128.c @@ -58,8 +58,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -86,7 +86,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld64.c index 7acb0a80f2f..1fd8dba4887 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-avx-ld64.c @@ -58,8 +58,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld64( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -94,7 +94,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld128.c index 28701fde985..8d7efa5bb8a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld128.c @@ -71,8 +71,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld128( __m128i vacc0x3 = _mm_unpackhi_epi64(vksum230, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -99,7 +99,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld128( vacc0x3 = 
_mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld64.c index 44cdc0250ec..6202c00151e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse2-ld64.c @@ -71,8 +71,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld64( __m128i vacc0x3 = _mm_unpackhi_epi64(vksum230, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -111,7 +111,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld128.c index 56a6d15b0d3..c3a4b35fd3c 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld128.c @@ -55,8 +55,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -83,7 +83,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld64.c index 00ecca34f4c..a04cf9fbd64 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-sse41-ld64.c @@ -55,8 +55,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -91,7 +91,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld128.c index 455b665ea99..471bc4fd09d 100644 --- 
a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld128.c @@ -61,8 +61,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -85,7 +85,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld64.c index 9bb331e43bf..164b0404ba4 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-xop-ld64.c @@ -60,8 +60,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -92,7 +92,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x8c8-minmax-avx2.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x8c8-minmax-avx2.c index ace2bbbbb94..78ef7c95858 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x8c8-minmax-avx2.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x8c8-minmax-avx2.c @@ -64,8 +64,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point0); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -120,7 +120,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld128.c index a3b1fcf6e4a..47f6b34f98e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld128.c @@ -70,8 +70,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -105,7 +105,7 @@ void 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld64.c index b20da168ac8..ecb1a448ef5 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-avx-ld64.c @@ -70,8 +70,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__avx_ld64( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -113,7 +113,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__avx_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld128.c index de52ba3837b..56ea5566407 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld128.c @@ -90,8 +90,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse2_ld128( __m128i vacc1x3 = _mm_unpackhi_epi64(vksum231, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -125,7 +125,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse2_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld64.c index ccee484bb1e..f5a57cb59f3 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse2-ld64.c @@ -90,8 +90,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse2_ld64( __m128i vacc1x3 = _mm_unpackhi_epi64(vksum231, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -137,7 +137,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse2_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld128.c index 
73fafd2e35a..a691890cc60 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld128.c @@ -67,8 +67,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_ld128( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -102,7 +102,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld64.c index c4e61236ff7..07970762ac2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-sse41-ld64.c @@ -67,8 +67,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_ld64( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -110,7 +110,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld128.c index 1c1258ecdf4..24482d5cdd9 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld128.c @@ -73,8 +73,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__xop_ld128( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -104,7 +104,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__xop_ld128( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld64.c index 5302561f627..2dab5e03829 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-xop-ld64.c @@ -72,8 +72,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -111,7 +111,7 @@ void 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x8c8-minmax-avx2.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x8c8-minmax-avx2.c index c8dbf01f64f..1989deb07d5 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x8c8-minmax-avx2.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x8c8-minmax-avx2.c @@ -75,8 +75,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx2( __m256i vacc1x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point1); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -138,7 +138,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx2( vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld128.c index 87375cf62f5..d8fafb48e09 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld128.c @@ -82,8 +82,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__avx_ld128( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -124,7 +124,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__avx_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld64.c index 07313a2c6f6..2052f30f591 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-avx-ld64.c @@ -82,8 +82,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__avx_ld64( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -132,7 +132,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__avx_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld128.c index b41f07fdb18..2d41d107c5e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld128.c +++ 
b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld128.c @@ -110,8 +110,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse2_ld128( __m128i vacc2x3 = _mm_unpackhi_epi64(vksum232, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -152,7 +152,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse2_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld64.c index ae4830d3bac..dd1802d0142 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse2-ld64.c @@ -110,8 +110,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse2_ld64( __m128i vacc2x3 = _mm_unpackhi_epi64(vksum232, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -164,7 +164,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse2_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld128.c index 9acbfe0fcd3..0276114576b 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld128.c @@ -80,8 +80,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld128( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -122,7 +122,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld64.c index 1837af06e58..6d7215086aa 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-sse41-ld64.c @@ -80,8 +80,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld64( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = 
_mm_cvtepi8_epi16(va0); a0 += 8; @@ -130,7 +130,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld128.c index 4d1d7511efe..cd2e1d0bcc0 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld128.c @@ -85,8 +85,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__xop_ld128( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -123,7 +123,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__xop_ld128( vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld64.c index 0140ecbfcb7..0a87691be76 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-xop-ld64.c @@ -84,8 +84,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__xop_ld64( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -130,7 +130,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__xop_ld64( vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x8c8-minmax-avx2.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x8c8-minmax-avx2.c index c9842a4869a..187d71321b6 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x8c8-minmax-avx2.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x8c8-minmax-avx2.c @@ -86,8 +86,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx2( __m256i vacc2x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point2); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -156,7 +156,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld128.c index d16572659b1..1085e980f72 100644 --- 
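The only difference between the sse2 and the sse41/avx/xop variants of these hunks is how eight int8 activations widen to int16: SSE2 pairs each byte with itself and shifts the high copy back down with its sign, while SSE4.1 has a direct sign-extend instruction. A standalone sketch of the two equivalent forms:

    #include <emmintrin.h>  // SSE2: _mm_unpacklo_epi8, _mm_srai_epi16
    #include <smmintrin.h>  // SSE4.1: _mm_cvtepi8_epi16

    static inline __m128i widen_sse2(__m128i va) {
      // (b, b) byte pairs fill the int16 lanes; >> 8 restores each value with its sign
      return _mm_srai_epi16(_mm_unpacklo_epi8(va, va), 8);
    }

    static inline __m128i widen_sse41(__m128i va) {
      return _mm_cvtepi8_epi16(va);  // direct int8 -> int16 sign extension
    }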
a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld128.c @@ -94,8 +94,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -143,7 +143,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld64.c index e88944acad9..4a0caf13136 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-avx-ld64.c @@ -94,8 +94,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld64( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -151,7 +151,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld64( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld128.c index 86d1f371c5d..4e22a9852cd 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld128.c @@ -129,8 +129,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld128( __m128i vacc3x3 = _mm_unpackhi_epi64(vksum233, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -178,7 +178,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld64.c index a1a40296bbb..ad701b9b817 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse2-ld64.c @@ -129,8 +129,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld64( __m128i vacc3x3 = _mm_unpackhi_epi64(vksum233, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = 
_mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -190,7 +190,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld64( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld128.c index 91b07259669..85ce4207149 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld128.c @@ -92,8 +92,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -141,7 +141,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld64.c index b340dd87359..50dfc69ba1c 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld64.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-sse41-ld64.c @@ -92,8 +92,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_ld64( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -149,7 +149,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_ld64( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld128.c index 81fbf11a91e..b11642dcc83 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld128.c @@ -97,8 +97,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -142,7 +142,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128( vacc3x3 = _mm_maddd_epi16(vxa3, vxb3, vacc3x3); w = (const int8_t*) w + 16; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld64.c index 4a2c715fe55..247645244ef 
100644
--- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld64.c
+++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-xop-ld64.c
@@ -96,8 +96,8 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld64(
     __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F);
     w = (const int32_t*) w + 4;
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -149,7 +149,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld64(
       vacc3x3 = _mm_maddd_epi16(vxa3, vxb3, vacc3x3);
       w = (const int8_t*) w + 16;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-avx512vnni.c
index bb2f7262cb3..879b99d332b 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-avx512vnni.c
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-avx512vnni.c
@@ -53,9 +53,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-
       a0 += 4;
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -66,7 +65,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512vnni(
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld128.c
index 74a449a24b9..3c1d8424de6 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld128.c
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld128.c
@@ -56,8 +56,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld128(
     __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F);
     w = (const int32_t*) w + 4;
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -78,7 +78,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld128(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld64.c
index 7462c34a628..afd1a69bf5e 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld64.c
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-avx-ld64.c
@@ -56,8 +56,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld64(
     __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F);
     w = (const int32_t*) w + 4;
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -80,7 +80,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld64(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld128.c
index 93f95943392..6799386f479 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld128.c
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld128.c
@@ -69,8 +69,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld128(
     __m128i vacc0x3 = _mm_unpackhi_epi64(vksum230, vzero);
     w = (const int32_t*) w + 4;
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -91,7 +91,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld128(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld64.c
index e0a8da6dd2f..b671ed28058 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld64.c
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse2-ld64.c
@@ -69,8 +69,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld64(
     __m128i vacc0x3 = _mm_unpackhi_epi64(vksum230, vzero);
     w = (const int32_t*) w + 4;
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -93,7 +93,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld64(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld128.c
index c9dd6156f84..6c4001b1fbe 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld128.c
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld128.c
@@ -53,8 +53,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld128(
     __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F);
     w = (const int32_t*) w + 4;
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -75,7 +75,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld128(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld64.c
index 67f7dee0243..f9fc8379aad 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld64.c
+++
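The accumulation step repeated in every loop body above is untouched by this diff: _mm_madd_epi16 multiplies eight int16 pairs and sums adjacent products into four int32 lanes, which then fold into the accumulator; the xop kernels fuse that final add into _mm_maddd_epi16. Note also the per-step weight advance visible in the hunks: 16 bytes in the qc4w kernels (two 4-bit weights per byte) versus 32 bytes in the qc8w ones. A sketch of the non-XOP form:

    #include <emmintrin.h>  // SSE2: _mm_madd_epi16, _mm_add_epi32

    // vxa, vxb: eight int16 values each; returns vacc plus four int32
    // partial dot products (one per pair of adjacent lanes).
    static inline __m128i acc_step(__m128i vacc, __m128i vxa, __m128i vxb) {
      return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb));
    }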
b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-sse41-ld64.c @@ -53,8 +53,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -77,7 +77,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld128.c index 684db253102..ab764d3734e 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld128.c @@ -56,8 +56,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld128( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -78,7 +78,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld128( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld64.c index 41f9b710441..7cf92b667ad 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c8-minmax-xop-ld64.c @@ -56,8 +56,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_blend_epi16(vinit0, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -80,7 +80,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c8-minmax-avx2.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c8-minmax-avx2.c index 3b9dbdc321a..c88236dba7e 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c8-minmax-avx2.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c8-minmax-avx2.c @@ -63,8 +63,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point0); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -83,7 +83,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = 
(const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16c4-minmax-avx512vnni.c index 4c300496b93..257a2ea0070 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16c4-minmax-avx512vnni.c @@ -61,11 +61,10 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16c4__avx512vnni( w = (const int32_t*) w + 16; size_t k = kc; - do { + while (k >= 4 * sizeof(int8_t)) { __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0)); - __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -78,7 +77,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld128.c index 303663173d7..76180055c34 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld128.c @@ -68,8 +68,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -97,7 +97,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld64.c index a6e7fdba6b9..10e70b7b469 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-avx-ld64.c @@ -68,8 +68,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld64( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -99,7 +99,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld128.c index 40127883246..d9bf5c5d4af 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld128.c +++ 
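The avx512vnni hunks make a related but distinct change: the old do/while only terminated correctly when kc was a non-zero multiple of 4, while the new while guard exits cleanly with any remainder left in k. A scalar stand-in for the new loop shape (unaligned_load_u32 is XNNPACK's helper; it is emulated here with memcpy):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static inline uint32_t load_u32(const void* p) {
      uint32_t v;
      memcpy(&v, p, sizeof(v));  // unaligned-safe 4-byte load
      return v;
    }

    static void k_loop(const int8_t* a0, size_t kc) {
      size_t k = kc;
      while (k >= 4 * sizeof(int8_t)) {  // was: do { ... } while (k != 0);
        uint32_t va0 = load_u32(a0);     // four int8 activations at once
        a0 += 4;
        (void) va0;                      // the multiply-accumulate happens here
        k -= 4 * sizeof(int8_t);
      }
    }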
b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld128.c @@ -88,8 +88,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse2_ld128( __m128i vacc1x3 = _mm_unpackhi_epi64(vksum231, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -117,7 +117,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse2_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld64.c index b0834c1dd89..2c23e458bda 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse2-ld64.c @@ -88,8 +88,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse2_ld64( __m128i vacc1x3 = _mm_unpackhi_epi64(vksum231, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -119,7 +119,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse2_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld128.c index d4ecbc552b5..a8e00bf6f54 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld128.c @@ -65,8 +65,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse41_ld128( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -94,7 +94,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse41_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld64.c index f7eaacac7c6..b2068f8c30f 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-sse41-ld64.c @@ -65,8 +65,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse41_ld64( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = 
_mm_cvtepi8_epi16(va0); a0 += 8; @@ -96,7 +96,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse41_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld128.c index 67b06cea6f8..09331cd34f0 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld128.c @@ -68,8 +68,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld128( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -97,7 +97,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld128( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld64.c index 156c821a11e..8c72ff9a7a7 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c8-minmax-xop-ld64.c @@ -68,8 +68,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = _mm_blend_epi16(vinit1, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -99,7 +99,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c8-minmax-avx2.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c8-minmax-avx2.c index f710283e34c..ec59a58c159 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c8-minmax-avx2.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c8-minmax-avx2.c @@ -74,8 +74,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__avx2( __m256i vacc1x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point1); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -101,7 +101,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__avx2( vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16c4-minmax-avx512vnni.c index cc73bc42cdd..b905140ed8b 100644 --- 
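In the avx2 kernels the widened activations are mirrored into both 128-bit halves of a 256-bit register, so a single 256-bit multiply-add covers two groups of output columns. A standalone sketch of that load sequence:

    #include <immintrin.h>  // AVX2

    static inline __m256i broadcast_eight_int8(const int8_t* a) {
      const __m128i va = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a));
      // duplicate the low 128 bits into the high lane
      return _mm256_inserti128_si256(_mm256_castsi128_si256(va), va, 1);
    }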
a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16c4-minmax-avx512vnni.c @@ -69,13 +69,12 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16c4__avx512vnni( w = (const int32_t*) w + 16; size_t k = kc; - do { + while (k >= 4 * sizeof(int8_t)) { __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0)); - __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); - __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; + __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); a2 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -90,7 +89,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld128.c index 50798954841..ff3b12b3220 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld128.c @@ -80,8 +80,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__avx_ld128( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -116,7 +116,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__avx_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld64.c index 04a95167d9e..2005164e488 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-avx-ld64.c @@ -80,8 +80,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__avx_ld64( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -118,7 +118,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__avx_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld128.c index d6d892a0642..146c235c56d 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld128.c @@ -108,8 +108,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld128( __m128i vacc2x3 = _mm_unpackhi_epi64(vksum232, vzero); w = (const int32_t*) w + 4; - size_t k = 
0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -144,7 +144,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld64.c index 18d35ddfaeb..c18cc5b647d 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse2-ld64.c @@ -108,8 +108,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld64( __m128i vacc2x3 = _mm_unpackhi_epi64(vksum232, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -146,7 +146,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld128.c index 33ecd49b9a9..5fc4b07accb 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld128.c @@ -78,8 +78,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld128( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -114,7 +114,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld64.c index bdee4e12497..40cff143f46 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-sse41-ld64.c @@ -78,8 +78,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld64( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -116,7 +116,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * 
sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld128.c index 2c98d015767..7aca09f5a9e 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld128.c @@ -80,8 +80,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__xop_ld128( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -116,7 +116,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__xop_ld128( vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld64.c index eea45d3fdb9..61a0d41e2b8 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c8-minmax-xop-ld64.c @@ -80,8 +80,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__xop_ld64( __m128i vacc2x3 = _mm_blend_epi16(vinit2, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -118,7 +118,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__xop_ld64( vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c8-minmax-avx2.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c8-minmax-avx2.c index 0ed8b08a786..da9658c7b2c 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c8-minmax-avx2.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c8-minmax-avx2.c @@ -85,8 +85,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2( __m256i vacc2x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point2); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -119,7 +119,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-avx512vnni.c index e066846e285..a558ae4ed12 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-avx512vnni.c @@ -77,15 +77,14 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__avx512vnni( w = (const int32_t*) w + 16; size_t k = kc; - do { + while (k 
>= 4 * sizeof(int8_t)) { __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0)); - __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); - __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); - __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; + __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); a2 += 4; + __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); a3 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -102,7 +101,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld128.c index 7a327212411..65d892a8af6 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld128.c @@ -92,8 +92,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__avx_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -135,7 +135,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__avx_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld64.c index 8bb902614f7..047c8b74a1f 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-avx-ld64.c @@ -92,8 +92,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__avx_ld64( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -137,7 +137,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__avx_ld64( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld128.c index 7594cf3e0ab..8613b87d65b 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld128.c @@ -127,8 +127,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse2_ld128( __m128i vacc3x3 = _mm_unpackhi_epi64(vksum233, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const 
__m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -170,7 +170,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse2_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld64.c index eea666470d6..906d9990a20 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse2-ld64.c @@ -127,8 +127,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse2_ld64( __m128i vacc3x3 = _mm_unpackhi_epi64(vksum233, vzero); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -172,7 +172,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse2_ld64( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld128.c index 78a8db9304e..7bea578cc28 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld128.c @@ -90,8 +90,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse41_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -133,7 +133,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse41_ld128( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld64.c index 27f78058764..275c11c7b34 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-sse41-ld64.c @@ -90,8 +90,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse41_ld64( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -135,7 +135,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse41_ld64( vacc3x3 = _mm_add_epi32(vacc3x3, _mm_madd_epi16(vxa3, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git 
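After each K loop, the four per-column accumulators still hold four partial sums apiece; the hadd lines that close every sse41/avx/xop hunk collapse them into one vector of column totals. The sse2 kernels reach the same result with unpack/add pairs (the vacc0x02 lines above), since phaddd requires SSSE3. A sketch of the SSSE3+ reduction:

    #include <tmmintrin.h>  // SSSE3: _mm_hadd_epi32

    static inline __m128i reduce_columns(__m128i vacc0, __m128i vacc1,
                                         __m128i vacc2, __m128i vacc3) {
      const __m128i vacc01 = _mm_hadd_epi32(vacc0, vacc1);  // sums for columns 0, 1
      const __m128i vacc23 = _mm_hadd_epi32(vacc2, vacc3);  // sums for columns 2, 3
      return _mm_hadd_epi32(vacc01, vacc23);                // columns 0..3
    }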
a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld128.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld128.c index 464252a6e85..437b16f9cfb 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld128.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld128.c @@ -92,8 +92,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__xop_ld128( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -135,7 +135,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__xop_ld128( vacc3x3 = _mm_maddd_epi16(vxa3, vxb3, vacc3x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld64.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld64.c index 1e113d784de..985bd89c609 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld64.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c8-minmax-xop-ld64.c @@ -92,8 +92,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__xop_ld64( __m128i vacc3x3 = _mm_blend_epi16(vinit3, vzero, 0x3F); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -137,7 +137,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__xop_ld64( vacc3x3 = _mm_maddd_epi16(vxa3, vxb3, vacc3x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16c4-minmax-avx512vnni.c index 12f39985a9f..514f7ba04d5 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16c4-minmax-avx512vnni.c @@ -85,17 +85,16 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c4__avx512vnni( w = (const int32_t*) w + 16; size_t k = kc; - do { + while (k >= 4 * sizeof(int8_t)) { __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0)); - __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); - __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); - __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); - __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; + __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); a2 += 4; + __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); a3 += 4; + __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); a4 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -114,7 +113,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git 
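The vsign_mask XOR in the avx512vnni loop bodies exists because the VNNI multiply-accumulate (vpdpbusd) treats one operand as unsigned: flipping the top bit of each byte maps int8 to a biased uint8, and the bias is compensated through the vksum/input-zero-point initialization of the accumulators at the top of these kernels. A scalar illustration of the bit trick (hypothetical helper name):

    #include <stdint.h>

    static inline uint8_t bias_to_unsigned(int8_t x) {
      return (uint8_t) (x ^ 0x80);  // equals (int) x + 128, now in 0..255
    }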
a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16c4-minmax-avx512vnni.c index 2ecae695782..ee571ccbc1c 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16c4-minmax-avx512vnni.c @@ -93,19 +93,18 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__avx512vnni( w = (const int32_t*) w + 16; size_t k = kc; - do { + while (k >= 4 * sizeof(int8_t)) { __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0)); - __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); - __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); - __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); - __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); - __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; + __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); a2 += 4; + __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); a3 += 4; + __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); a4 += 4; + __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5)); a5 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -126,7 +125,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16c4-minmax-avx512vnni.c index 5a1d9d405db..6db2a2feae9 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16c4-minmax-avx512vnni.c @@ -101,21 +101,20 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512vnni( w = (const int32_t*) w + 16; size_t k = kc; - do { + while (k >= 4 * sizeof(int8_t)) { __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0)); - __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); - __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); - __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); - __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); - __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5)); - __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; + __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); a2 += 4; + __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); a3 += 4; + __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); a4 += 4; + __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5)); a5 += 4; + __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6)); a6 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -138,7 +137,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = 
_mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16c4-minmax-avx512vnni.c index a76e8aafb14..dbf595306e9 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16c4-minmax-avx512vnni.c @@ -109,23 +109,22 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__avx512vnni( w = (const int32_t*) w + 16; size_t k = kc; - do { + while (k >= 4 * sizeof(int8_t)) { __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0)); - __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); - __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); - __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); - __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); - __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5)); - __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6)); - __m512i va7x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a7)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; + __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); a2 += 4; + __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); a3 += 4; + __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); a4 += 4; + __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5)); a5 += 4; + __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6)); a6 += 4; + __m512i va7x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a7)); a7 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -150,7 +149,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git a/src/qs8-gemm/MRx16c4-avx512vnni.c.in b/src/qs8-gemm/MRx16c4-avx512vnni.c.in index c32fbf5662b..0a6ac0dbb8e 100644 --- a/src/qs8-gemm/MRx16c4-avx512vnni.c.in +++ b/src/qs8-gemm/MRx16c4-avx512vnni.c.in @@ -83,6 +83,8 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c4__ const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->${PARAMS_STRUCT}.output_zero_point); const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min); const __m128i vshuffle_control_mask = _mm_loadu_si128((const __m128i*) params->${PARAMS_STRUCT}.shuffle_control_mask); + $if DATATYPE == "QU8": + const __m512i vb_zero_point = _mm512_load_si512(params->${PARAMS_STRUCT}.kernel_zero_point); do { $if DATATYPE == "QD8": const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); @@ -95,13 +97,9 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c4__ w = (const int32_t*) w + 16; size_t k = kc; - $if DATATYPE == "QU8": - const __m512i vb_zero_point = _mm512_load_si512(params->${PARAMS_STRUCT}.kernel_zero_point); - do { + while (k >= 4 * sizeof(${XINT8_T})) { $for M in range(MR): __m512i va${M}x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a${M})); - - $for M in range(MR): a${M} += 4; $for M in range(MR): @@ -114,7 +112,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c4__ w = (const ${XINT8_T}*) w + 64; k -= 4 * sizeof(${XINT8_T}); - } while (k != 0); + 
 
     $for M in range(MR):
       __m512 vscaled${M}x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc${M}x0123456789ABCDEF);
diff --git a/src/qs8-gemm/MRx4c8-sse.c.in b/src/qs8-gemm/MRx4c8-sse.c.in
index 8055204d0f5..972258e6b88 100644
--- a/src/qs8-gemm/MRx4c8-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-sse.c.in
@@ -158,12 +158,12 @@ void xnn_${DATATYPE_SPEC}_gemm${GEMM_SUFFIX}_minmax${REQUANTIZATION_SPEC}_ukerne
         __m128i vacc${M}x${N} = vacc0x${N};
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
    $if DATATYPE == "QU8":
      const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point);
    $if SSE < 4 or VARIANT == "LD128":
      const __m128i vzero = _mm_setzero_si128();
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(${XINT8_T})) {
      $for M in range(MR):
        const __m128i va${M} = _mm_loadl_epi64((const __m128i*) a${M});
        $if DATATYPE == "QU8":
@@ -298,7 +298,7 @@ void xnn_${DATATYPE_SPEC}_gemm${GEMM_SUFFIX}_minmax${REQUANTIZATION_SPEC}_ukerne
          w = (const ${XINT8_T}*) w + 16;
        $else:
          w = (const ${XINT8_T}*) w + 32;
-      k += 8 * sizeof(${XINT8_T});
+      k -= 8 * sizeof(${XINT8_T});
     }
 
    $if SSE >= 3:
diff --git a/src/qs8-gemm/MRx8c8-avx2.c.in b/src/qs8-gemm/MRx8c8-avx2.c.in
index 69fc6b68111..ab876d7947e 100644
--- a/src/qs8-gemm/MRx8c8-avx2.c.in
+++ b/src/qs8-gemm/MRx8c8-avx2.c.in
@@ -75,6 +75,8 @@ void xnn_${DATATYPE_SPEC}_gemm${GEMM_SUFFIX}_minmax${REQUANTIZATION_SPEC}_ukerne
       c${M} = c${M-1};
     }
 
+  $if DATATYPE == "QU8":
+    const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->${PARAMS_STRUCT}.kernel_zero_point);
   $if DATATYPE == "QC4":
     const __m128i vmask = _mm_load_si128((const __m128i*) params->avx.mask);  // 0xF0
   do {
@@ -97,10 +99,8 @@ void xnn_${DATATYPE_SPEC}_gemm${GEMM_SUFFIX}_minmax${REQUANTIZATION_SPEC}_ukerne
         __m256i vacc${M}x${ABC[N:N+2]} = vacc0x${ABC[N:N+2]};
     w = (const int32_t*) w + 8;
 
-    size_t k = 0;
-    $if DATATYPE == "QU8":
-      const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->${PARAMS_STRUCT}.kernel_zero_point);
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(${XINT8_T})) {
      $for M in range(MR):
        const __m128i va${M} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) a${M}));
        const __m256i vxa${M} = _mm256_inserti128_si256(_mm256_castsi128_si256(va${M}), va${M}, 1);
@@ -143,7 +143,7 @@ void xnn_${DATATYPE_SPEC}_gemm${GEMM_SUFFIX}_minmax${REQUANTIZATION_SPEC}_ukerne
          w = (const ${XINT8_T}*) w + 32;
        $else:
          w = (const ${XINT8_T}*) w + 64;
-      k += 8 * sizeof(${XINT8_T});
+      k -= 8 * sizeof(${XINT8_T});
     }
 
    $for M in range(MR):
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-avx512vnni.c
index 712800b0632..63dac86f528 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-avx512vnni.c
@@ -53,9 +53,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-
       a0 += 4;
 
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -66,7 +65,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni(
 
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
 
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
 
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld128.c
index d77168510b4..b90500c2990 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld128.c
@@ -52,8 +52,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -74,7 +74,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld64.c
index 22a34159a22..48bc28ac56d 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld64.c
@@ -52,8 +52,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -76,7 +76,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld128.c
index b3c3bda373e..2d72c710e48 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld128.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -70,7 +70,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld64.c
index 76d31f21bf0..5862f16c994 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld64.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -72,7 +72,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld128.c
index cd8c49b1974..3e5c83d1cf7 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld128.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -70,7 +70,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld64.c
index fce5500842f..50bd79c51a6 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld64.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -72,7 +72,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld128.c
index 9f955faf4ac..2d5b4e87a7a 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld128.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -70,7 +70,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld64.c
index 076498f8024..6732f38ee9d 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld64.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -72,7 +72,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld128.c
index 4c2a05aa1a6..e42d75a978d 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld128.c
@@ -52,8 +52,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -74,7 +74,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128(
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
 
      w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld64.c
index e025b8c66d8..1a09f80e25b 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld64.c
@@ -52,8 +52,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -76,7 +76,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64(
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-avx.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-avx.c
index 3824a702c3d..4e2e1ce18c4 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-avx.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-avx.c
@@ -52,8 +52,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -72,7 +72,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse2.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse2.c
index b86c524d84e..ed1c1649b20 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse2.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -68,7 +68,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse41.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse41.c
index 085669f66eb..0841a096608 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse41.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse41.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -68,7 +68,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-ssse3.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-ssse3.c
index 06275d612a4..d3b18913dde 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-ssse3.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-ssse3.c
@@ -48,8 +48,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -68,7 +68,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3(
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-xop.c b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-xop.c
index 3f7f4864096..8ab3188813b 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-xop.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-xop.c
@@ -52,8 +52,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop(
     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -72,7 +72,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop(
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-avx2.c b/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-avx2.c
index 6d57be45729..b4082ee439f 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-avx2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-avx2.c
@@ -57,8 +57,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
     w = (const int32_t*) w + 8;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -77,7 +77,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2(
       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
 
       w = (const int8_t*) w + 64;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
diff --git a/src/qs8-gemm/gen/qs8-gemm-1x8c8-xw-minmax-fp32-avx2.c b/src/qs8-gemm/gen/qs8-gemm-1x8c8-xw-minmax-fp32-avx2.c
index c3acb2e0abc..193d03fd1e5 100644
--- a/src/qs8-gemm/gen/qs8-gemm-1x8c8-xw-minmax-fp32-avx2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-1x8c8-xw-minmax-fp32-avx2.c
@@ -57,8 +57,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2(
     __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
     w = (const int32_t*) w + 8;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -77,7 +77,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2(
       vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
 
       w = (const int16_t*) w + 64;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-fp32-avx512vnni.c
index adb4c5af75d..bd1dd592002 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-fp32-avx512vnni.c
@@ -60,11 +60,10 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
-
       a0 += 4;
+      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
       a1 += 4;
 
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -77,7 +76,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x16c4__avx512vnni(
 
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
 
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
     __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld128.c
index 98b4492d606..29fa296f744 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld128.c
@@ -62,8 +62,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -91,7 +91,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld64.c
index 9e4554bdb8a..481d211576d 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld64.c
@@ -62,8 +62,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -93,7 +93,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld128.c
index 209686966dc..79224789d19 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld128.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -87,7 +87,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld64.c
index c02e4aaf620..112bf4be04f 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld64.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -89,7 +89,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld128.c
index ad09731cfc0..766087f6957 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld128.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -87,7 +87,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld64.c
index ba768d4682a..5efdcf4027d 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld64.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -89,7 +89,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld128.c
index 5fa72b07015..5cbdfeb306f 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld128.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -87,7 +87,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld64.c
index 25ecad64748..c5b085379e9 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld64.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -89,7 +89,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld128.c
index 6f8954331b4..abe82148dd8 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld128.c
@@ -62,8 +62,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -91,7 +91,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128(
       vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld64.c
index 5c8a5d20fcf..b06164099ad 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld64.c
@@ -62,8 +62,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -93,7 +93,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64(
       vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-avx.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-avx.c
index 0d2ef0a555b..f3c2bfa85a4 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-avx.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-avx.c
@@ -62,8 +62,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -89,7 +89,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse2.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse2.c
index f250f764b87..9e00c667f13 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse2.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -85,7 +85,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse41.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse41.c
index 9d7b258afdc..962d1b614a5 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse41.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse41.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -85,7 +85,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-ssse3.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-ssse3.c
index 85862f4ca25..e0013f81f75 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-ssse3.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-ssse3.c
@@ -58,8 +58,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -85,7 +85,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3(
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-xop.c b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-xop.c
index 0d54375cd35..cd407f74b19 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-xop.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-xop.c
@@ -62,8 +62,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop(
     __m128i vacc1x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -89,7 +89,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop(
       vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-avx2.c b/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-avx2.c
index 236defe3744..990333317a6 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-avx2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-avx2.c
@@ -67,8 +67,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2(
     __m256i vacc1x67 = vacc0x67;
     w = (const int32_t*) w + 8;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -94,7 +94,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2(
       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
 
       w = (const int8_t*) w + 64;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
diff --git a/src/qs8-gemm/gen/qs8-gemm-2x8c8-xw-minmax-fp32-avx2.c b/src/qs8-gemm/gen/qs8-gemm-2x8c8-xw-minmax-fp32-avx2.c
index a73d7139a03..cead6ecb2e9 100644
--- a/src/qs8-gemm/gen/qs8-gemm-2x8c8-xw-minmax-fp32-avx2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-2x8c8-xw-minmax-fp32-avx2.c
@@ -67,8 +67,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2(
     __m256i vacc1x67 = vacc0x67;
     w = (const int32_t*) w + 8;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -94,7 +94,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2(
       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
 
       w = (const int16_t*) w + 64;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-fp32-avx512vnni.c
index 0261de67e34..a3e761515bc 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-fp32-avx512vnni.c
@@ -67,13 +67,12 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
      __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
-      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
-
       a0 += 4;
+      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
       a1 += 4;
+      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
       a2 += 4;
 
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -88,7 +87,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x16c4__avx512vnni(
 
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
 
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
     __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld128.c
index fd9c4a354ac..47a1f2ca7f8 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld128.c
@@ -72,8 +72,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -108,7 +108,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld64.c
index 17d0d27e9d1..af479e67164 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld64.c
@@ -72,8 +72,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -110,7 +110,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld128.c
index b79f134e46b..1b10f2ef43c 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld128.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -104,7 +104,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld64.c
index 538a0da85d3..f08ac803f86 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld64.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -106,7 +106,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld128.c
index 8ee29fe84c3..8ef09ade31d 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld128.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -104,7 +104,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld64.c
index 307d211429c..eb91ad730c9 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld64.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -106,7 +106,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld128.c
index 5337d879645..654d7bf5d70 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld128.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
      a0 += 8;
@@ -104,7 +104,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld64.c
index 92a2d3436eb..49f3a8b5776 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld64.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -106,7 +106,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld128.c
index 5578263cacb..9828d93b234 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld128.c
@@ -72,8 +72,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -108,7 +108,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128(
       vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld64.c
index 9bc0a265ca7..f20e5cab3f3 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld64.c
@@ -72,8 +72,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -110,7 +110,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64(
       vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
 
       w = (const int8_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-avx.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-avx.c
index add27a1e4b6..4c2e17f04c5 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-avx.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-avx.c
@@ -72,8 +72,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -106,7 +106,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse2.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse2.c
index adadc158b01..b5fabb210ad 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse2.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -102,7 +102,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse41.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse41.c
index b27fbb5dd64..9c630a76de5 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse41.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse41.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -102,7 +102,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-ssse3.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-ssse3.c
index 2e72d3d57e2..0e8f473d951 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-ssse3.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-ssse3.c
@@ -68,8 +68,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
@@ -102,7 +102,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-xop.c b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-xop.c
index 98aa7aef4a8..42fd34f0814 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-xop.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-xop.c
@@ -72,8 +72,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
       a0 += 8;
@@ -106,7 +106,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop(
       vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
 
       w = (const int16_t*) w + 32;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-fp32-avx2.c b/src/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-fp32-avx2.c
index f8a1c58bb77..29480750dea 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-fp32-avx2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-fp32-avx2.c
@@ -77,8 +77,8 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
     __m256i vacc2x67 = vacc0x67;
     w = (const int32_t*) w + 8;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -111,7 +111,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
 
       w = (const int8_t*) w + 64;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
diff --git a/src/qs8-gemm/gen/qs8-gemm-3x8c8-xw-minmax-fp32-avx2.c b/src/qs8-gemm/gen/qs8-gemm-3x8c8-xw-minmax-fp32-avx2.c
index a5c876267df..2aa0c33e1f6 100644
--- a/src/qs8-gemm/gen/qs8-gemm-3x8c8-xw-minmax-fp32-avx2.c
+++ b/src/qs8-gemm/gen/qs8-gemm-3x8c8-xw-minmax-fp32-avx2.c
@@ -77,8 +77,8 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2(
     __m256i vacc2x67 = vacc0x67;
     w = (const int32_t*) w + 8;
 
-    size_t k = 0;
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -111,7 +111,7 @@ void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2(
       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
 
       w = (const int16_t*) w + 64;
-      k += 8 * sizeof(int8_t);
+      k -= 8 * sizeof(int8_t);
     }
 
     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
diff --git a/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-avx512vnni.c
index 20fc4e81b0d..f5ee11d8160 100644
--- a/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-avx512vnni.c
@@ -74,15 +74,14 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
-      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
-      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
-
       a0 += 4;
+      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
       a1 += 4;
+      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
       a2 += 4;
+      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
       a3 += 4;
 
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -99,7 +98,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__avx512vnni(
 
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
 
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
     __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF);
diff --git a/src/qs8-gemm/gen/qs8-gemm-5x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-5x16c4-minmax-fp32-avx512vnni.c
index d8b3a12239f..e596cf0294b 100644
--- a/src/qs8-gemm/gen/qs8-gemm-5x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-5x16c4-minmax-fp32-avx512vnni.c
@@ -81,17 +81,16 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_5x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
-      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
-      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
-      __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4));
-
       a0 += 4;
+      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
       a1 += 4;
+      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
       a2 += 4;
+      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
       a3 += 4;
+      __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4));
       a4 += 4;
 
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -110,7 +109,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_5x16c4__avx512vnni(
 
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
 
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
     __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF);
diff --git a/src/qs8-gemm/gen/qs8-gemm-6x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-6x16c4-minmax-fp32-avx512vnni.c
index 5f200903195..dd6c5732657 100644
--- a/src/qs8-gemm/gen/qs8-gemm-6x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-6x16c4-minmax-fp32-avx512vnni.c
@@ -88,19 +88,18 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_6x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
-      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
-      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
-      __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4));
-      __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5));
-
       a0 += 4;
+      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
       a1 += 4;
+      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
       a2 += 4;
+      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
       a3 += 4;
+      __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4));
       a4 += 4;
+      __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5));
       a5 += 4;
 
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -121,7 +120,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_6x16c4__avx512vnni(
 
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
 
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
     __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF);
diff --git a/src/qs8-gemm/gen/qs8-gemm-7x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-7x16c4-minmax-fp32-avx512vnni.c
index 7d72e729995..8b9eeaeafaa 100644
--- a/src/qs8-gemm/gen/qs8-gemm-7x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-7x16c4-minmax-fp32-avx512vnni.c
@@ -95,21 +95,20 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_7x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
-      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
-      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
-      __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4));
-      __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5));
-      __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6));
-
       a0 += 4;
+      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
       a1 += 4;
+      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
       a2 += 4;
+      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
       a3 += 4;
+      __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4));
       a4 += 4;
+      __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5));
       a5 += 4;
+      __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6));
       a6 += 4;
 
       va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask);
@@ -132,7 +131,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_7x16c4__avx512vnni(
 
       w = (const int8_t*) w + 64;
       k -= 4 * sizeof(int8_t);
-    } while (k != 0);
+    }
 
     __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF);
     __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF);
diff --git a/src/qs8-gemm/gen/qs8-gemm-8x16c4-minmax-fp32-avx512vnni.c b/src/qs8-gemm/gen/qs8-gemm-8x16c4-minmax-fp32-avx512vnni.c
index d84f8107023..7da6a57d83b 100644
--- a/src/qs8-gemm/gen/qs8-gemm-8x16c4-minmax-fp32-avx512vnni.c
+++ b/src/qs8-gemm/gen/qs8-gemm-8x16c4-minmax-fp32-avx512vnni.c
@@ -102,23 +102,22 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_8x16c4__avx512vnni(
     w = (const int32_t*) w + 16;
 
     size_t k = kc;
-    do {
+    while (k >= 4 * sizeof(int8_t)) {
       __m512i va0x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a0));
-      __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1));
-      __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2));
-      __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3));
-      __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4));
-      __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5));
unaligned_load_u32(a5)); - __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6)); - __m512i va7x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a7)); - a0 += 4; + __m512i va1x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a1)); a1 += 4; + __m512i va2x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a2)); a2 += 4; + __m512i va3x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a3)); a3 += 4; + __m512i va4x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a4)); a4 += 4; + __m512i va5x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a5)); a5 += 4; + __m512i va6x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a6)); a6 += 4; + __m512i va7x0123 = _mm512_set1_epi32((int) unaligned_load_u32(a7)); a7 += 4; va0x0123 = _mm512_xor_epi32(va0x0123, vsign_mask); @@ -143,7 +142,7 @@ void xnn_qs8_gemm_minmax_fp32_ukernel_8x16c4__avx512vnni( w = (const int8_t*) w + 64; k -= 4 * sizeof(int8_t); - } while (k != 0); + } __m512 vscaled0x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc0x0123456789ABCDEF); __m512 vscaled1x0123456789ABCDEF = _mm512_cvtepi32_ps(vacc1x0123456789ABCDEF); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld128.c index 3f9cd8dd5dc..dd28426c675 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld128.c @@ -52,8 +52,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -74,7 +74,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld64.c index 0bc547ee841..e906f5d3dc4 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-avx-ld64.c @@ -52,8 +52,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -76,7 +76,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld128.c index ac7b6f47f8a..17c901c1b21 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld128.c @@ -48,8 +48,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 
0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -70,7 +70,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld64.c index 9f73bba5028..7684379b482 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse2-ld64.c @@ -48,8 +48,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld128.c index 048a61bb153..b7af1f6299f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld128.c @@ -48,8 +48,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -70,7 +70,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld64.c index d7c8c357799..a885008bcef 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-sse41-ld64.c @@ -48,8 +48,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = 
_mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld128.c index 14bf7a6ff2c..96c18e84c93 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld128.c @@ -52,8 +52,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -74,7 +74,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld64.c index b2d175379c4..3bf03c0a11f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c8-minmax-fp32-xop-ld64.c @@ -52,8 +52,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -76,7 +76,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx2.c index ed9fa4ff97a..331bbd9942f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx2.c @@ -57,8 +57,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1); w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -77,7 +77,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-xw-minmax-fp32-avx2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-xw-minmax-fp32-avx2.c index 668793a5369..85533959699 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-xw-minmax-fp32-avx2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-xw-minmax-fp32-avx2.c @@ -57,8 +57,8 @@ void xnn_qs8_qc8w_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1); w = (const int32_t*) w + 8; - size_t k = 0; - while (k 
< kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -77,7 +77,7 @@ void xnn_qs8_qc8w_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const int16_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld128.c index d23cf592acc..11894435bdd 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld128.c @@ -62,8 +62,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -91,7 +91,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld64.c index f18b5d7d63e..359e370e75f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-avx-ld64.c @@ -62,8 +62,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -93,7 +93,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld128.c index f60cb3fb68a..07993edd150 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld128.c @@ -58,8 +58,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -87,7 +87,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld64.c 
b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld64.c index f416285c200..edda011512a 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse2-ld64.c @@ -58,8 +58,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -89,7 +89,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld128.c index bbea03c206c..498ed175a6f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld128.c @@ -58,8 +58,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -87,7 +87,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld64.c index 2653b00ae90..0bfb27acd65 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-sse41-ld64.c @@ -58,8 +58,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -89,7 +89,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld128.c index e2f89b83de3..8f4a0be873f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld128.c @@ -62,8 +62,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -91,7 +91,7 @@ void 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld64.c index 713a54aeb35..09d63bc0b81 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c8-minmax-fp32-xop-ld64.c @@ -62,8 +62,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -93,7 +93,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avx2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avx2.c index 6db97a6ca39..4b3a140eb60 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avx2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avx2.c @@ -67,8 +67,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__avx2( __m256i vacc1x67 = vacc0x67; w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -94,7 +94,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__avx2( vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-xw-minmax-fp32-avx2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-xw-minmax-fp32-avx2.c index 1ede516dbf4..a15764d2397 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-xw-minmax-fp32-avx2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-xw-minmax-fp32-avx2.c @@ -67,8 +67,8 @@ void xnn_qs8_qc8w_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2( __m256i vacc1x67 = vacc0x67; w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -94,7 +94,7 @@ void xnn_qs8_qc8w_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2( vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67)); w = (const int16_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld128.c index 38348865c78..7c6936e0ca5 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld128.c @@ -72,8 +72,8 @@ void 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -108,7 +108,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld64.c index f60de587b24..c2501c97cc5 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-avx-ld64.c @@ -72,8 +72,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -110,7 +110,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld128.c index 597b738adcf..07344d47346 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld128.c @@ -68,8 +68,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -104,7 +104,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld64.c index a5a45b48c2f..f6ced434a05 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse2-ld64.c @@ -68,8 +68,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8); a0 += 8; @@ -106,7 +106,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x02 = 
_mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld128.c index 4dcce411acb..b7dfc225029 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld128.c @@ -68,8 +68,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -104,7 +104,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld64.c index fc5f7f17b53..c2a64679b47 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-sse41-ld64.c @@ -68,8 +68,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -106,7 +106,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64( vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3)); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld128.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld128.c index 673907c0d2a..0157a74a882 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld128.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld128.c @@ -72,8 +72,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -108,7 +108,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128( vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld64.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld64.c index a69f93fac98..e8a4ea43a4b 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld64.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c8-minmax-fp32-xop-ld64.c @@ -72,8 +72,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64( __m128i vacc2x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const 
__m128i vxa0 = _mm_cvtepi8_epi16(va0); a0 += 8; @@ -110,7 +110,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64( vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3); w = (const int8_t*) w + 32; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avx2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avx2.c index 60b92039eec..63d38112011 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avx2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avx2.c @@ -77,8 +77,8 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2( __m256i vacc2x67 = vacc0x67; w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -111,7 +111,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const int8_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-xw-minmax-fp32-avx2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-xw-minmax-fp32-avx2.c index 88f970c04e3..c314c8cadf9 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-xw-minmax-fp32-avx2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-xw-minmax-fp32-avx2.c @@ -77,8 +77,8 @@ void xnn_qs8_qc8w_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2( __m256i vacc2x67 = vacc0x67; w = (const int32_t*) w + 8; - size_t k = 0; - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { const __m128i va0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -111,7 +111,7 @@ void xnn_qs8_qc8w_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2( vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67)); w = (const int16_t*) w + 64; - k += 8 * sizeof(int8_t); + k -= 8 * sizeof(int8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld128.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld128.c index bc354a53af5..689ba25d1ad 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld128.c @@ -52,10 +52,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -74,7 +74,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld64.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld64.c index 
f9db0013d64..22734d2c7ca 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld64.c @@ -52,9 +52,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -77,7 +77,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld128.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld128.c index 9b684ec8e0d..311f562efad 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld128.c @@ -48,10 +48,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero); a0 += 8; @@ -70,7 +70,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c index 631fa656478..660ca205bef 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c @@ -48,10 +48,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero); a0 += 8; @@ -74,7 +74,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld128.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld128.c index f47a1d21ef9..0b080e34472 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld128.c @@ -48,10 +48,10 @@ void 
xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -70,7 +70,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c index db0a0366b28..2f1f07e0c16 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c @@ -48,9 +48,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -73,7 +73,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64( vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld128.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld128.c index d2a9a0c0a76..85cec7870dd 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld128.c @@ -52,10 +52,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -74,7 +74,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld64.c b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld64.c index c4c81e5e2f2..9b763518e66 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld64.c @@ -52,9 +52,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]); w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) 
a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -77,7 +77,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64( vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-1x8c8-minmax-fp32-avx2.c b/src/qu8-gemm/gen/qu8-gemm-1x8c8-minmax-fp32-avx2.c index 052ff283f9d..05b841e80f6 100644 --- a/src/qu8-gemm/gen/qu8-gemm-1x8c8-minmax-fp32-avx2.c +++ b/src/qu8-gemm/gen/qu8-gemm-1x8c8-minmax-fp32-avx2.c @@ -42,6 +42,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2( const uint8_t* a0 = a; uint8_t* c0 = c; + const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point); do { const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]); const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]); @@ -57,9 +58,8 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2( __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1); w = (const int32_t*) w + 8; - size_t k = 0; - const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0)); const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1); a0 += 8; @@ -78,7 +78,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2( vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67)); w = (const uint8_t*) w + 64; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld128.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld128.c index 39ec0c31134..89639558be1 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld128.c @@ -62,10 +62,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -91,7 +91,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld64.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld64.c index 37159d8406e..8ede534faac 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld64.c @@ -62,9 +62,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ 
-94,7 +94,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld128.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld128.c index 5a4a759801c..0fb857f8b1c 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld128.c @@ -58,10 +58,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero); a0 += 8; @@ -87,7 +87,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld64.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld64.c index 08f622c8c67..5daf63bbc0c 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld64.c @@ -58,10 +58,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero); a0 += 8; @@ -91,7 +91,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2)); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld128.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld128.c index aa9313de360..181b7efaaf1 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld128.c @@ -58,10 +58,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -87,7 +87,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, 
vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld64.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld64.c index e36c2695fd1..87b5251ec35 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld64.c @@ -58,9 +58,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -90,7 +90,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64( vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3)); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld128.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld128.c index 0894ea40ce6..3bb230457fd 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld128.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld128.c @@ -62,10 +62,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); const __m128i vzero = _mm_setzero_si128(); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -91,7 +91,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld64.c b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld64.c index dc3576147e8..50205f767e2 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld64.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld64.c @@ -62,9 +62,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( __m128i vacc1x3 = vacc0x3; w = (const int32_t*) w + 4; - size_t k = 0; const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point); - while (k < kc) { + size_t k = kc; + while (k >= 8 * sizeof(uint8_t)) { const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0); const __m128i vxa0 = _mm_cvtepu8_epi16(va0); a0 += 8; @@ -94,7 +94,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64( vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3); w = (const uint8_t*) w + 32; - k += 8 * sizeof(uint8_t); + k -= 8 * sizeof(uint8_t); } const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1); diff --git a/src/qu8-gemm/gen/qu8-gemm-2x8c8-minmax-fp32-avx2.c b/src/qu8-gemm/gen/qu8-gemm-2x8c8-minmax-fp32-avx2.c index 4942cea12c5..6aead76c090 100644 --- a/src/qu8-gemm/gen/qu8-gemm-2x8c8-minmax-fp32-avx2.c +++ b/src/qu8-gemm/gen/qu8-gemm-2x8c8-minmax-fp32-avx2.c @@ -48,6 +48,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2( c1 = c0; } + const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point); do { const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]); 
     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
@@ -67,9 +68,8 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2(
     __m256i vacc1x67 = vacc0x67;
     w = (const int32_t*) w + 8;

-    size_t k = 0;
-    const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -95,7 +95,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2(
       vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));

       w = (const uint8_t*) w + 64;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld128.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld128.c
index b6621d7329b..cd0100c6a70 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld128.c
@@ -72,10 +72,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
     const __m128i vzero = _mm_setzero_si128();
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
       a0 += 8;
@@ -108,7 +108,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld64.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld64.c
index 83ecfed3025..7bc7c8a5ce7 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld64.c
@@ -72,9 +72,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
       a0 += 8;
@@ -111,7 +111,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld128.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld128.c
index d1a5eb52a8e..6c06385635f 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld128.c
@@ -68,10 +68,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
     const __m128i vzero = _mm_setzero_si128();
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
       a0 += 8;
@@ -104,7 +104,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c
index 0515181b6dc..f475da1ae37 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c
@@ -68,10 +68,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
     const __m128i vzero = _mm_setzero_si128();
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
       a0 += 8;
@@ -108,7 +108,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld128.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld128.c
index 6385f3b332d..425f4f9a65e 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld128.c
@@ -68,10 +68,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
     const __m128i vzero = _mm_setzero_si128();
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
       a0 += 8;
@@ -104,7 +104,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c
index d3df154b43b..78b382d5c91 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c
@@ -68,9 +68,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
       a0 += 8;
@@ -107,7 +107,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
       vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld128.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld128.c
index 3f660b6041f..69e63f6bf37 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld128.c
@@ -72,10 +72,10 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
     const __m128i vzero = _mm_setzero_si128();
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
       a0 += 8;
@@ -108,7 +108,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128(
       vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld64.c b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld64.c
index 5efe4f8453d..2df22a30de4 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld64.c
@@ -72,9 +72,9 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64(
     __m128i vacc2x3 = vacc0x3;
     w = (const int32_t*) w + 4;

-    size_t k = 0;
     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
       a0 += 8;
@@ -111,7 +111,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64(
       vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);

       w = (const uint8_t*) w + 32;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
diff --git a/src/qu8-gemm/gen/qu8-gemm-3x8c8-minmax-fp32-avx2.c b/src/qu8-gemm/gen/qu8-gemm-3x8c8-minmax-fp32-avx2.c
index a0ad3a3d371..c43c4ad4dfc 100644
--- a/src/qu8-gemm/gen/qu8-gemm-3x8c8-minmax-fp32-avx2.c
+++ b/src/qu8-gemm/gen/qu8-gemm-3x8c8-minmax-fp32-avx2.c
@@ -54,6 +54,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
     c2 = c1;
   }

+  const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
   do {
     const __m128i vbias0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
     const __m128i vbias0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
@@ -77,9 +78,8 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
     __m256i vacc2x67 = vacc0x67;
     w = (const int32_t*) w + 8;

-    size_t k = 0;
-    const __m256i vb_zero_point = _mm256_load_si256((const __m256i*) params->fp32_avx2.kernel_zero_point);
-    while (k < kc) {
+    size_t k = kc;
+    while (k >= 8 * sizeof(uint8_t)) {
       const __m128i va0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) a0));
       const __m256i vxa0 = _mm256_inserti128_si256(_mm256_castsi128_si256(va0), va0, 1);
       a0 += 8;
@@ -112,7 +112,7 @@ void xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2(
       vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));

       w = (const uint8_t*) w + 64;
-      k += 8 * sizeof(uint8_t);
+      k -= 8 * sizeof(uint8_t);
     }

     const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
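The hunks above all apply the same loop rewrite: the reduction loop over kc now initializes k to kc, counts it down, and keeps iterating while a whole sub-block remains, instead of counting k up and testing k < kc. The observable difference is that k holds the leftover byte count when the loop exits, so a remainder tail can consume it directly. Below is a minimal standalone C sketch of that loop shape, assuming an 8-byte sub-block as in the 4c8 kernels; it is not part of the diff, dot8 and gemm_k_loop are hypothetical names rather than XNNPACK APIs, and a scalar dot product stands in for the _mm_madd_epi16 accumulation.

#include <stddef.h>
#include <stdint.h>

// Scalar stand-in for one 8-element multiply-accumulate step.
static int32_t dot8(const int8_t* a, const int8_t* b) {
  int32_t acc = 0;
  for (size_t i = 0; i < 8; i++) {
    acc += (int32_t) a[i] * (int32_t) b[i];
  }
  return acc;
}

int32_t gemm_k_loop(const int8_t* a0, const int8_t* w, size_t kc) {
  int32_t vacc0 = 0;
  size_t k = kc;                     // was: size_t k = 0;
  while (k >= 8 * sizeof(int8_t)) {  // was: while (k < kc) {
    vacc0 += dot8(a0, w);
    a0 += 8;
    w += 8;
    k -= 8 * sizeof(int8_t);         // was: k += 8 * sizeof(int8_t);
  }
  // On exit k == kc % 8: the remainder is already in hand for a tail loop.
  return vacc0;
}

Note the guard also changes the semantics at the boundary: the old do/while form in the avx512vnni kernels executed the body at least once and required kc to be a whole number of sub-blocks, while the down-counting while form simply skips the body when fewer than a full sub-block remains.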