diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h
index 6420c92103c..f18c127fd4f 100644
--- a/src/layer/x86/convolution_im2col_gemm_int8.h
+++ b/src/layer/x86/convolution_im2col_gemm_int8.h
@@ -33,11 +33,11 @@ void unpack_output_tile_int32_avx2(const Mat& topT, Mat& top_blob, int i, int ma
 #endif
 
 // gemm_x86.h
-#if __AVX512F__
+#if NCNN_RUNTIME_CPU && __AVX512F__
 namespace Gemm_x86_avx512_utility {
-#elif __FMA__
+#elif NCNN_RUNTIME_CPU && __FMA__
 namespace Gemm_x86_fma_utility {
-#elif __AVX__
+#elif NCNN_RUNTIME_CPU && __AVX__
 namespace Gemm_x86_avx_utility {
 #else
 namespace Gemm_x86_utility {
@@ -50,11 +50,11 @@ static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, in
 {
     // A = (pa, maxk, inch/pa), outch
 
-#if __AVX512F__
+#if NCNN_RUNTIME_CPU && __AVX512F__
     Gemm_x86_avx512_utility::pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
-#elif __FMA__
+#elif NCNN_RUNTIME_CPU && __FMA__
     Gemm_x86_fma_utility::pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
-#elif __AVX__
+#elif NCNN_RUNTIME_CPU && __AVX__
     Gemm_x86_avx_utility::pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
 #else
     Gemm_x86_utility::pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
@@ -65,11 +65,11 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
 {
     // NCNN_LOGE("convolution_gemm_transB_packed_tile_int8 %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk);
 
-#if __AVX512F__
+#if NCNN_RUNTIME_CPU && __AVX512F__
     Gemm_x86_avx512_utility::gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
-#elif __FMA__
+#elif NCNN_RUNTIME_CPU && __FMA__
     Gemm_x86_fma_utility::gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
-#elif __AVX__
+#elif NCNN_RUNTIME_CPU && __AVX__
     Gemm_x86_avx_utility::gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
 #else
     Gemm_x86_utility::gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
@@ -1604,22 +1604,22 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                     _mm_storeu_si128((__m128i*)offsets2, _vindex2);
                     _mm_storeu_si128((__m128i*)offsets3, _vindex3);
 
-                    pp[0] = offsets0[0];
-                    pp[1] = offsets2[0];
-                    pp[2] = offsets0[1];
-                    pp[3] = offsets2[1];
-                    pp[4] = offsets0[2];
-                    pp[5] = offsets2[2];
-                    pp[6] = offsets0[3];
-                    pp[7] = offsets2[3];
-                    pp[8] = offsets1[0];
-                    pp[9] = offsets3[0];
-                    pp[10] = offsets1[1];
-                    pp[11] = offsets3[1];
-                    pp[12] = offsets1[2];
-                    pp[13] = offsets3[2];
-                    pp[14] = offsets1[3];
-                    pp[15] = offsets3[3];
+                    pp[0] = ((const signed char*)bottom_blob)[offsets0[0]];
+                    pp[1] = ((const signed char*)bottom_blob)[offsets2[0]];
+                    pp[2] = ((const signed char*)bottom_blob)[offsets0[1]];
+                    pp[3] = ((const signed char*)bottom_blob)[offsets2[1]];
+                    pp[4] = ((const signed char*)bottom_blob)[offsets0[2]];
+                    pp[5] = ((const signed char*)bottom_blob)[offsets2[2]];
+                    pp[6] = ((const signed char*)bottom_blob)[offsets0[3]];
+                    pp[7] = ((const signed char*)bottom_blob)[offsets2[3]];
+                    pp[8] = ((const signed char*)bottom_blob)[offsets1[0]];
+                    pp[9] = ((const signed char*)bottom_blob)[offsets3[0]];
+                    pp[10] = ((const signed char*)bottom_blob)[offsets1[1]];
+                    pp[11] = ((const signed char*)bottom_blob)[offsets3[1]];
+                    pp[12] = ((const signed char*)bottom_blob)[offsets1[2]];
+                    pp[13] = ((const signed char*)bottom_blob)[offsets3[2]];
+                    pp[14] = ((const signed char*)bottom_blob)[offsets1[3]];
+                    pp[15] = ((const signed char*)bottom_blob)[offsets3[3]];
 
 #endif // __AVX2__
 
@@ -1651,14 +1651,14 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
                     _mm_storeu_si128((__m128i*)offsets0, _vindex0);
                     _mm_storeu_si128((__m128i*)offsets1, _vindex1);
 
-                    pp[0] = offsets0[0];
-                    pp[1] = offsets0[1];
-                    pp[2] = offsets0[2];
-                    pp[3] = offsets0[3];
-                    pp[4] = offsets1[0];
-                    pp[5] = offsets1[1];
-                    pp[6] = offsets1[2];
-                    pp[7] = offsets1[3];
+                    pp[0] = ((const signed char*)bottom_blob)[offsets0[0]];
+                    pp[1] = ((const signed char*)bottom_blob)[offsets0[1]];
+                    pp[2] = ((const signed char*)bottom_blob)[offsets0[2]];
+                    pp[3] = ((const signed char*)bottom_blob)[offsets0[3]];
+                    pp[4] = ((const signed char*)bottom_blob)[offsets1[0]];
+                    pp[5] = ((const signed char*)bottom_blob)[offsets1[1]];
+                    pp[6] = ((const signed char*)bottom_blob)[offsets1[2]];
+                    pp[7] = ((const signed char*)bottom_blob)[offsets1[3]];
 
 #endif // __AVX2__
 
diff --git a/src/layer/x86/gemm_x86.h b/src/layer/x86/gemm_x86.h
index 603a36a78cc..a7539e9cee6 100644
--- a/src/layer/x86/gemm_x86.h
+++ b/src/layer/x86/gemm_x86.h
@@ -41,7 +41,7 @@ class Gemm_x86 : public Gemm
     Mat CT_data;
 };
 
-// expose some gemm internal routines
+// expose some gemm internal routines for use by convolution
 namespace Gemm_x86_utility {
 #if NCNN_INT8
 void pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk);