diff --git a/CMakeLists.txt b/CMakeLists.txt index 47cb137553095c..0ddcead86f0259 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,9 @@ set(LLAMA_FMA OFF) set(LLAMA_F16C OFF) set(CMAKE_CUDA_FLAGS "--verbose") # set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics -set(CUDACXX /usr/local/cuda-12.2/bin/nvcc) +set(CUDACXX /usr/local/cuda-12.3/bin/nvcc) +set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.3/bin/nvcc) +set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.3) #GGML_USE_CUBLAS #set(CMAKE_EXE_LINKER_FLAGS -pg) @@ -128,7 +130,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" # Compile flags # -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) @@ -420,14 +422,15 @@ endif() if (LLAMA_ALL_WARNINGS) if (NOT MSVC) - set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + # -Wpedantic + set(warning_flags -Wall -Wextra -Wcast-qual -Wno-unused-function) set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) - set(cxx_flags -Wmissing-declarations -Wmissing-noreturn) + set(cxx_flags -Wmissing-declarations -Wmissing-noreturn -fpermissive) set(host_cxx_flags "") if (CMAKE_C_COMPILER_ID MATCHES "Clang") set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) - set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi) + set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi -fpermissive) if ( (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR @@ -437,13 +440,13 @@ if (LLAMA_ALL_WARNINGS) endif() elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") set(c_flags ${c_flags} -Wdouble-promotion) - set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds) + set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds -fpermissive) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) - set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation) + set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation -fpermissive) endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) - set(host_cxx_flags ${host_cxx_flags} -Wextra-semi) + set(host_cxx_flags ${host_cxx_flags} -Wextra-semi -fpermissive) endif() endif() else() @@ -451,7 +454,7 @@ if (LLAMA_ALL_WARNINGS) endif() set(c_flags ${c_flags} -save-temps --verbose ${warning_flags}) - set(cxx_flags ${cxx_flags} -save-temps --verbose ${warning_flags}) + set(cxx_flags ${cxx_flags} -fpermissive -save-temps --verbose ${warning_flags}) add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>" "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>" "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>") @@ -677,13 +680,13 @@ if (GGML_USE_CPU_HBM) endif() add_library(ggml OBJECT - ggml.c + ggml.cpp ggml.h - ggml-alloc.c + ggml-alloc.cpp ggml-alloc.h - ggml-backend.c + ggml-backend.cpp ggml-backend.h - ggml-quants.c + ggml-quants.cpp ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} diff --git a/ggml-quants.cpp b/ggml-quants.cpp index 13ab722e77477c..a084f66c9c5860 100644 --- a/ggml-quants.cpp +++ b/ggml-quants.cpp @@ -425,7 +425,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 *
__restrict__ y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -462,11 +462,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q4_0_reference(x, y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k) { const int qk = QK4_1; assert(k % qk == 0); @@ -503,11 +503,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q4_1_reference(x, y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -551,11 +551,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q5_0_reference(x, y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k) { const int qk = QK5_1; assert(k % qk == 0); @@ -599,12 +599,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q5_1_reference(x, y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { +void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -629,12 +629,12 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -818,7 +818,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -853,11 +853,11 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK8_1 
== 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * __restrict__ y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1071,7 +1071,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1091,7 +1091,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1112,7 +1112,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1138,7 +1138,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1165,7 +1165,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1195,7 +1195,7 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * __restrict__ x, int8_t * __restrict__ L, int rmse_type) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1259,7 +1259,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * __restrict__ x, int8_t * __restrict__ L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1318,7 +1318,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * __restrict__ x, uint8_t * __restrict__ L, float * __restrict__ the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1361,8 +1361,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * __restrict__ x, const float * __restrict__ weights, + uint8_t * __restrict__ L, float * __restrict__ the_min, 
uint8_t * __restrict__ Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1443,7 +1443,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } #if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1455,7 +1455,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1532,7 +1532,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1578,15 +1578,15 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ vy, int k) { quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + block_q2_K * __restrict__ y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q2_K)); @@ -1594,7 +1594,7 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1708,7 +1708,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1722,8 +1722,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * __restrict__ q = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1758,7 +1758,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); 
assert(QK_K == 64); const int nb = k / QK_K; @@ -1767,8 +1767,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * __restrict__ q = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); @@ -1791,15 +1791,15 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ vy, int k) { quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + block_q3_K * __restrict__ y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q3_K)); @@ -1807,7 +1807,7 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1914,7 +1914,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1953,18 +1953,18 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * __restrict__ y = vy; quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + block_q4_K * __restrict__ y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q4_K)); @@ -1972,7 +1972,7 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2042,8 +2042,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict 
ql = y[i].qs; + uint8_t * __restrict__ qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2090,8 +2090,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * __restrict__ qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].qs; memset(qh, 0, QK_K/8); for (int j = 0; j < 32; ++j) { @@ -2114,7 +2114,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2143,7 +2143,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } #else float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; + const int8_t * __restrict__ s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); @@ -2159,18 +2159,18 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * __restrict__ y = vy; quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + block_q5_K * __restrict__ y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q5_K)); @@ -2178,7 +2178,7 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2228,8 +2228,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].ql; + uint8_t * __restrict__ qh = y[i].qh; #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2260,7 +2260,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2268,9 +2268,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * __restrict__ ql = x[i].ql; + const uint8_t * __restrict__ qh = 
x[i].qh; + const int8_t * __restrict__ sc = x[i].scales; #if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { @@ -2307,9 +2307,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * __restrict__ y = vy; quantize_row_q6_K_reference(x, y, k); } @@ -2318,7 +2318,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + block_q6_K * __restrict__ y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q6_K)); @@ -2326,7 +2326,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2363,7 +2363,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2374,7 +2374,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { +void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q8_K_reference(x, y, k); } @@ -2423,15 +2423,15 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy); const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2440,10 +2440,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q4_0 * __restrict__ x0 = &x[i + 0]; + const block_q4_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i + 0]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2734,14 +2734,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void 
ggml_vec_dot_q4_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * __restrict__ x = vx; + const block_q8_1 * __restrict__ y = vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) @@ -2753,10 +2753,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q4_1 * __restrict__ x0 = &x[i + 0]; + const block_q4_1 * __restrict__ x1 = &x[i + 1]; + const block_q8_1 * __restrict__ y0 = &y[i + 0]; + const block_q8_1 * __restrict__ y1 = &y[i + 1]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; @@ -2894,15 +2894,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2917,10 +2917,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q5_0 * __restrict__ x0 = &x[i]; + const block_q5_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3001,8 +3001,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; + const block_q5_0 * __restrict__ x0 = &x[i]; + const block_q8_0 * __restrict__ y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3200,15 +3200,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * __restrict__ x = vx; + const block_q8_1 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3226,10 +3226,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const 
block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q5_1 * __restrict__ x0 = &x[i]; + const block_q5_1 * __restrict__ x1 = &x[i + 1]; + const block_q8_1 * __restrict__ y0 = &y[i]; + const block_q8_1 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3315,8 +3315,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; + const block_q5_1 * __restrict__ x0 = &x[i]; + const block_q8_1 * __restrict__ y0 = &y[i]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s; @@ -3519,14 +3519,14 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3535,10 +3535,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q8_0 * __restrict__ x0 = &x[i + 0]; + const block_q8_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i + 0]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3643,10 +3643,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -3668,9 +3668,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + const uint8_t * __restrict__ sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -3747,8 +3747,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * 
__restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -3814,8 +3814,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -4036,10 +4036,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -4062,9 +4062,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4115,8 +4115,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * __restrict__ db = (const uint8_t *)&ud; + const uint8_t * __restrict__ mb = (const uint8_t *)&um; float summs = 0; @@ -4127,10 +4127,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4167,8 +4167,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * __restrict__ db = (const uint8_t *)&ud; + const uint8_t * __restrict__ mb = (const uint8_t *)&um; float summs = 0; @@ -4179,10 +4179,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q2 = x[i].qs; + const 
int8_t * __restrict__ q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4228,9 +4228,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4312,14 +4312,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -4347,9 +4347,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].hmask; + const int8_t * __restrict__ q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -4455,8 +4455,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -4560,8 +4560,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -4695,9 +4695,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].hmask; + const int8_t * __restrict__ q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -4807,11 +4807,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; + const 
int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -4856,11 +4856,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -4948,8 +4948,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5019,8 +5019,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5099,8 +5099,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5174,10 +5174,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; + const int8_t * __restrict__ q8 = y[i].qs; + int8_t * __restrict__ a = aux8; for (int l = 0; l < 8; ++l) { a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 
0 : 4); @@ -5214,11 +5214,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -5263,8 +5263,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -5335,8 +5335,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -5394,8 +5394,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5495,8 +5495,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; vl = 32; @@ -5549,10 +5549,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -5595,11 +5595,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -5619,14 +5619,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sum_mins = 0.f; uint16_t aux16[2]; - const uint8_t * restrict scales = (const 
uint8_t *)aux16; + const uint8_t * __restrict__ scales = (const uint8_t *)aux16; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; - const uint16_t * restrict a = (const uint16_t *)x[i].scales; + const uint16_t * __restrict__ a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; @@ -5699,8 +5699,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m256i q4l = _mm256_and_si256(q4bits, m4); @@ -5745,8 +5745,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); @@ -5779,16 +5779,16 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #elif defined __riscv_v_intrinsic uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * __restrict__ scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * __restrict__ b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5828,17 +5828,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memset(sums, 0, 8*sizeof(float)); uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * __restrict__ scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + uint8_t * __restrict__ a = aux8; for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * __restrict__ b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5862,11 +5862,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const 
int nb = n / QK_K; @@ -5912,9 +5912,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5977,8 +5977,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; #if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6066,8 +6066,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6164,9 +6164,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -6250,11 +6250,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -6303,11 +6303,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -6329,9 +6329,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const uint8x8_t qhbits = vld1_u8(qh); @@ -6388,8 +6388,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const 
void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6434,8 +6434,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6491,9 +6491,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -6561,10 +6561,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * __restrict__ q4 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; + int8_t * __restrict__ a = aux8; for (int l = 0; l < 32; ++l) { a[l+ 0] = q4[l] & 0xF; a[l+32] = q4[l] >> 4; @@ -6575,7 +6575,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri } const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; + const int8_t * __restrict__ sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; @@ -6592,11 +6592,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -6619,11 +6619,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q6 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * __restrict__ scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -6751,9 +6751,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = 
y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6831,9 +6831,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6943,11 +6943,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q6 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * __restrict__ scale = x[i].scales; size_t vl; @@ -7031,11 +7031,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -7068,11 +7068,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -7095,11 +7095,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q6 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * __restrict__ scale = x[i].scales; int32_t isum = 0; @@ -7158,9 +7158,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7215,9 +7215,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = 
@@ -7215,9 +7215,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t * __restrict__ q8 = y[i].qs;

         const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
         const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
@@ -7282,11 +7282,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d_all = (float)x[i].d;

-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q6 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t * __restrict__ q8 = y[i].qs;

-        const int8_t * restrict scale = x[i].scales;
+        const int8_t * __restrict__ scale = x[i].scales;

         int32_t isum = 0;
@@ -7351,11 +7351,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t * __restrict__ q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * __restrict__ a = aux8;
         for (int l = 0; l < 16; ++l) {
             a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
             a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
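The hunks above complete the mechanical swap of C99 `restrict` for the GNU `__restrict__` extension, which both g++ and nvcc accept in C++ mode. A more portable alternative would be to define the qualifier once per compiler and use a single spelling everywhere; a sketch, not part of this patch (the `GGML_RESTRICT` macro name is an assumption):

    // Hypothetical portability shim -- defined once in a shared header.
    #if defined(_MSC_VER)
    #  define GGML_RESTRICT __restrict       // MSVC spelling
    #elif defined(__GNUC__) || defined(__clang__)
    #  define GGML_RESTRICT __restrict__     // GCC/Clang extension, valid in C and C++
    #else
    #  define GGML_RESTRICT                  // safe fallback: drop the aliasing hint
    #endif

    // usage keeps one spelling in both C and C++ builds:
    void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k);

Since `restrict` is only an optimization hint, the empty fallback is semantically safe.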
diff --git a/ggml.cpp b/ggml.cpp
index 3f843003b60166..2ccf51fe989b22 100644
--- a/ggml.cpp
+++ b/ggml.cpp
@@ -423,191 +423,191 @@ static void ggml_vec_dot_f32(const int n, float * __restrict__ s, const float *
 static void ggml_vec_dot_f16(const int n, float * __restrict__ s, ggml_fp16_t * __restrict__ x, ggml_fp16_t * __restrict__ y);

 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
-        .type_name = "i8",
-        .blck_size = 1,
-        .type_size = sizeof(int8_t),
-        .is_quantized = false,
-    },
-    [GGML_TYPE_I16] = {
-        .type_name = "i16",
-        .blck_size = 1,
-        .type_size = sizeof(int16_t),
-        .is_quantized = false,
-    },
-    [GGML_TYPE_I32] = {
-        .type_name = "i32",
-        .blck_size = 1,
-        .type_size = sizeof(int32_t),
-        .is_quantized = false,
-    },
-    [GGML_TYPE_F32] = {
-        .type_name = "f32",
-        .blck_size = 1,
-        .type_size = sizeof(float),
-        .is_quantized = false,
-        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
-        .vec_dot_type = GGML_TYPE_F32,
-    },
-    [GGML_TYPE_F16] = {
-        .type_name = "f16",
-        .blck_size = 1,
-        .type_size = sizeof(ggml_fp16_t),
-        .is_quantized = false,
-        .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
-        .vec_dot_type = GGML_TYPE_F16,
-    },
-    [GGML_TYPE_Q4_0] = {
-        .type_name = "q4_0",
-        .blck_size = QK4_0,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q4_0,
-        .from_float = quantize_row_q4_0,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
-        .vec_dot = ggml_vec_dot_q4_0_q8_0,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
-        .type_name = "q4_1",
-        .blck_size = QK4_1,
-        .type_size = sizeof(block_q4_1),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q4_1,
-        .from_float = quantize_row_q4_1,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
-        .vec_dot = ggml_vec_dot_q4_1_q8_1,
-        .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [4] = { // GGML_TYPE_Q4_2
-        .type_name = "DEPRECATED",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
-        .to_float = NULL,
-        .from_float = NULL,
-        .from_float_reference = NULL,
-        .vec_dot = NULL,
-        .vec_dot_type = GGML_TYPE_COUNT,
-    },
-    [5] = { // GGML_TYPE_Q4_3
-        .type_name = "DEPRECATED",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
-        .to_float = NULL,
-        .from_float = NULL,
-        .from_float_reference = NULL,
-        .vec_dot = NULL,
-        .vec_dot_type = GGML_TYPE_COUNT,
-    },
-    [GGML_TYPE_Q5_0] = {
-        .type_name = "q5_0",
-        .blck_size = QK5_0,
-        .type_size = sizeof(block_q5_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q5_0,
-        .from_float = quantize_row_q5_0,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
-        .vec_dot = ggml_vec_dot_q5_0_q8_0,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_1] = {
-        .type_name = "q5_1",
-        .blck_size = QK5_1,
-        .type_size = sizeof(block_q5_1),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q5_1,
-        .from_float = quantize_row_q5_1,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
-        .vec_dot = ggml_vec_dot_q5_1_q8_1,
-        .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
-        .type_name = "q8_0",
-        .blck_size = QK8_0,
-        .type_size = sizeof(block_q8_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q8_0,
-        .from_float = quantize_row_q8_0,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
-        .vec_dot = ggml_vec_dot_q8_0_q8_0,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
-        .type_name = "q8_1",
-        .blck_size = QK8_1,
-        .type_size = sizeof(block_q8_1),
-        .is_quantized = true,
-        .from_float = quantize_row_q8_1,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
-        .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q2_K] = {
-        .type_name = "q2_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q2_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q2_K,
-        .from_float = quantize_row_q2_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
-        .vec_dot = ggml_vec_dot_q2_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q3_K] = {
-        .type_name = "q3_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q3_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q3_K,
-        .from_float = quantize_row_q3_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
-        .vec_dot = ggml_vec_dot_q3_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q4_K] = {
-        .type_name = "q4_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q4_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q4_K,
-        .from_float = quantize_row_q4_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
-        .vec_dot = ggml_vec_dot_q4_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q5_K] = {
-        .type_name = "q5_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q5_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q5_K,
-        .from_float = quantize_row_q5_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
-        .vec_dot = ggml_vec_dot_q5_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q6_K] = {
-        .type_name = "q6_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q6_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q6_K,
-        .from_float = quantize_row_q6_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
-        .vec_dot = ggml_vec_dot_q6_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q8_K] = {
-        .type_name = "q8_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q8_K),
-        .is_quantized = true,
-        .from_float = quantize_row_q8_K,
-    }
+    // [GGML_TYPE_I8] = {
+    //     .type_name = "i8",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(int8_t),
+    //     .is_quantized = false,
+    // },
+    // [GGML_TYPE_I16] = {
+    //     .type_name = "i16",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(int16_t),
+    //     .is_quantized = false,
+    // },
+    // [GGML_TYPE_I32] = {
+    //     .type_name = "i32",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(int32_t),
+    //     .is_quantized = false,
+    // },
+    // [GGML_TYPE_F32] = {
+    //     .type_name = "f32",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(float),
+    //     .is_quantized = false,
+    //     .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
+    //     .vec_dot_type = GGML_TYPE_F32,
+    // },
+    // [GGML_TYPE_F16] = {
+    //     .type_name = "f16",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(ggml_fp16_t),
+    //     .is_quantized = false,
+    //     .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
+    //     .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+    //     .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+    //     .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
+    //     .vec_dot_type = GGML_TYPE_F16,
+    // },
+    // [GGML_TYPE_Q4_0] = {
+    //     .type_name = "q4_0",
+    //     .blck_size = QK4_0,
+    //     .type_size = sizeof(block_q4_0),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q4_0,
+    //     .from_float = quantize_row_q4_0,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
+    //     .vec_dot = ggml_vec_dot_q4_0_q8_0,
+    //     .vec_dot_type = GGML_TYPE_Q8_0,
+    // },
+    // [GGML_TYPE_Q4_1] = {
+    //     .type_name = "q4_1",
+    //     .blck_size = QK4_1,
+    //     .type_size = sizeof(block_q4_1),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q4_1,
+    //     .from_float = quantize_row_q4_1,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
+    //     .vec_dot = ggml_vec_dot_q4_1_q8_1,
+    //     .vec_dot_type = GGML_TYPE_Q8_1,
+    // },
+    // [4] = { // GGML_TYPE_Q4_2
+    //     .type_name = "DEPRECATED",
+    //     .blck_size = 0,
+    //     .type_size = 0,
+    //     .is_quantized = false,
+    //     .to_float = NULL,
+    //     .from_float = NULL,
+    //     .from_float_reference = NULL,
+    //     .vec_dot = NULL,
+    //     .vec_dot_type = GGML_TYPE_COUNT,
+    // },
+    // [5] = { // GGML_TYPE_Q4_3
+    //     .type_name = "DEPRECATED",
+    //     .blck_size = 0,
+    //     .type_size = 0,
+    //     .is_quantized = false,
+    //     .to_float = NULL,
+    //     .from_float = NULL,
+    //     .from_float_reference = NULL,
+    //     .vec_dot = NULL,
+    //     .vec_dot_type = GGML_TYPE_COUNT,
+    // },
+    // [GGML_TYPE_Q5_0] = {
+    //     .type_name = "q5_0",
+    //     .blck_size = QK5_0,
+    //     .type_size = sizeof(block_q5_0),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q5_0,
+    //     .from_float = quantize_row_q5_0,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
+    //     .vec_dot = ggml_vec_dot_q5_0_q8_0,
+    //     .vec_dot_type = GGML_TYPE_Q8_0,
+    // },
+    // [GGML_TYPE_Q5_1] = {
+    //     .type_name = "q5_1",
+    //     .blck_size = QK5_1,
+    //     .type_size = sizeof(block_q5_1),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q5_1,
+    //     .from_float = quantize_row_q5_1,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
+    //     .vec_dot = ggml_vec_dot_q5_1_q8_1,
+    //     .vec_dot_type = GGML_TYPE_Q8_1,
+    // },
+    // [GGML_TYPE_Q8_0] = {
+    //     .type_name = "q8_0",
+    //     .blck_size = QK8_0,
+    //     .type_size = sizeof(block_q8_0),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q8_0,
+    //     .from_float = quantize_row_q8_0,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
+    //     .vec_dot = ggml_vec_dot_q8_0_q8_0,
+    //     .vec_dot_type = GGML_TYPE_Q8_0,
+    // },
+    // [GGML_TYPE_Q8_1] = {
+    //     .type_name = "q8_1",
+    //     .blck_size = QK8_1,
+    //     .type_size = sizeof(block_q8_1),
+    //     .is_quantized = true,
+    //     .from_float = quantize_row_q8_1,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
+    //     .vec_dot_type = GGML_TYPE_Q8_1,
+    // },
+    // [GGML_TYPE_Q2_K] = {
+    //     .type_name = "q2_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q2_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q2_K,
+    //     .from_float = quantize_row_q2_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
+    //     .vec_dot = ggml_vec_dot_q2_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q3_K] = {
+    //     .type_name = "q3_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q3_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q3_K,
+    //     .from_float = quantize_row_q3_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
+    //     .vec_dot = ggml_vec_dot_q3_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q4_K] = {
+    //     .type_name = "q4_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q4_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q4_K,
+    //     .from_float = quantize_row_q4_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
+    //     .vec_dot = ggml_vec_dot_q4_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q5_K] = {
+    //     .type_name = "q5_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q5_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q5_K,
+    //     .from_float = quantize_row_q5_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
+    //     .vec_dot = ggml_vec_dot_q5_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q6_K] = {
+    //     .type_name = "q6_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q6_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q6_K,
+    //     .from_float = quantize_row_q6_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
+    //     .vec_dot = ggml_vec_dot_q6_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q8_K] = {
+    //     .type_name = "q8_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q8_K),
+    //     .is_quantized = true,
+    //     .from_float = quantize_row_q8_K,
+    // }
 };
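Commenting out the whole `type_traits` table side-steps nvcc's rejection of C99 designated array initializers (C++ gained a restricted form only in C++20, and array designators remain invalid), but it leaves every entry zeroed, so lookups such as a type's block size or `vec_dot` would return 0/NULL at runtime. A C++-compatible sketch (not part of this patch; `ggml_init_type_traits` is a hypothetical helper, and the table's `const` must be dropped) fills the table imperatively at startup instead:

    static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];  // zero-initialized statics

    static void ggml_init_type_traits(void) {
        {
            ggml_type_traits_t t = {};  // start each entry from all-zero
            t.type_name    = "q6_K";
            t.blck_size    = QK_K;
            t.type_size    = sizeof(block_q6_K);
            t.is_quantized = true;
            t.to_float     = (ggml_to_float_t) dequantize_row_q6_K;
            t.from_float   = quantize_row_q6_K;
            t.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference;
            t.vec_dot      = ggml_vec_dot_q6_K_q8_K;
            t.vec_dot_type = GGML_TYPE_Q8_K;
            type_traits[GGML_TYPE_Q6_K] = t;
        }
        // ... one such block per GGML_TYPE_* entry, in any order ...
    }

Calling this once from the `is_first_call` branch of `ggml_init` (below) would restore the table's contents without designated initializers.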

 // For internal test use
@@ -2186,17 +2186,18 @@ static inline int ggml_up(int n, int m) {
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    static bool is_first_call = true;
-
-    if (is_first_call) {
-        // initialize time system (required on Windows)
-        ggml_time_init();
+    struct ggml_context * ctx = NULL;
+    static bool is_first_call = true;
+    // make this function thread safe
+    ggml_critical_section_start();
+
-        // initialize GELU, Quick GELU, SILU and EXP F32 tables
-        {
+    if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
+        // initialize GELU, Quick GELU, SILU and EXP F32 tables
+        {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

             ggml_fp16_t ii;
@@ -2219,13 +2220,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

-            g_state = (struct ggml_state) {
-                /*.contexts =*/ { { 0 } },
-                /*.numa =*/ {
-                    .n_nodes = 0,
-                    .total_cpus = 0,
-                },
-            };
+            // TODOFIXME
+            // g_state = (struct ggml_state) {
+            //     /*.contexts =*/ { { 0 } },
+            //     /*.numa =*/ {
+            //         .n_nodes = 0,
+            //         .total_cpus = 0,
+            //     },
+            //};

             for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
                 g_state.contexts[i].used = false;
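With the compound-literal assignment commented out, `g_state` keeps whatever it previously held. On the first call this is benign (file-scope statics are zero-initialized, and the loop above resets every `used` flag), but an explicit reset would preserve the old semantics exactly. A minimal C++-compatible sketch, not part of this patch, assuming all-zero was the intent (which the commented-out literal suggests):

    // <cstring> provides memset; every field of the old literal was zero
    memset(&g_state, 0, sizeof(g_state));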
@@ -2248,7 +2250,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }

     // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
+
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (!g_state.contexts[i].used) {
@@ -2412,7 +2414,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     // align to GGML_MEM_ALIGN
     size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);

-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@@ -2422,12 +2424,13 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
         return NULL;
     }

-    *obj_new = (struct ggml_object) {
-        .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = size_needed,
-        .next = NULL,
-        .type = type,
-    };
+    // FIXME
+    // *obj_new = (struct ggml_object) {
+    //     .offs = cur_end + GGML_OBJECT_SIZE,
+    //     .size = size_needed,
+    //     .next = NULL,
+    //     .type = type,
+    // };

     ggml_assert_aligned(mem_buffer + obj_new->offs);
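Note that this hunk trades a compile error for undefined behavior: the compound literal was the only write to `*obj_new`, and `obj_new->offs` is read on the very next line. Plain member assignments express the same initialization and are valid in both C and C++; a sketch of the fix (not part of this patch):

    // replaces the commented-out C99 compound literal field-for-field
    obj_new->offs = cur_end + GGML_OBJECT_SIZE;
    obj_new->size = size_needed;
    obj_new->next = NULL;
    obj_new->type = type;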
@@ -17229,7 +17232,7 @@ static enum ggml_opt_result ggml_opt_adam(
     const int n_accum = MAX(1, params.n_gradient_accumulation);
     const float accum_norm = 1.0f / (float) n_accum;

-    float * g = opt->adam.g->data;  // gradients
+    float * g = (float*)opt->adam.g->data;  // gradients
     float * m = opt->adam.m->data;  // first moment
     float * v = opt->adam.v->data;  // second moment
@@ -17844,36 +17847,39 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
                 };
             } break;
         case GGML_OPT_LBFGS:
-            {
-                result = (struct ggml_opt_params) {
-                    .type = GGML_OPT_LBFGS,
-                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
-                    .n_threads = 1,
-                    .past = 0,
-                    .delta = 1e-5f,
-
-                    .max_no_improvement = 0,
-
-                    .print_forward_graph = true,
-                    .print_backward_graph = true,
-
-                    .n_gradient_accumulation = 1,
-
-                    .lbfgs = {
-                        .m = 6,
-                        .n_iter = 100,
-                        .max_linesearch = 20,
-
-                        .eps = 1e-5f,
-                        .ftol = 1e-4f,
-                        .wolfe = 0.9f,
-                        .min_step = 1e-20f,
-                        .max_step = 1e+20f,
-
-                        .linesearch = GGML_LINESEARCH_DEFAULT,
-                    },
-                };
-            } break;
+            break;
+            //{
+            // TODO FIXME
+            //    result = (struct ggml_opt_params) {
+            //        .type = GGML_OPT_LBFGS,
+            //        .graph_size = GGML_DEFAULT_GRAPH_SIZE,
+            //        .n_threads = 1,
+            //        .past = 0,
+            //        .delta = 1e-5f,

+            //        .max_no_improvement = 0,

+            //        .print_forward_graph = true,
+            //        .print_backward_graph = true,

+            //        .n_gradient_accumulation = 1,

+            //        .lbfgs = {
+            //            .m = 6,
+            //            .n_iter = 100,
+            //            .max_linesearch = 20,

+            //            .eps = 1e-5f,
+            //            .ftol = 1e-4f,
+            //            .wolfe = 0.9f,
+            //            .min_step = 1e-20f,
+            //            .max_step = 1e+20f,

+            //            .linesearch = GGML_LINESEARCH_DEFAULT,

+            //        }
+            //};
+            //} break;
     }

     return result;
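Two observations on the optimizer hunks above. First, only `g` gained a `(float*)` cast in `ggml_opt_adam`; the adjacent `m` and `v` lines assign the same `void *` tensor data and will presumably need the identical cast to compile as C++ without `-fpermissive`. Second, `case GGML_OPT_LBFGS: break;` now returns `result` without ever filling in the L-BFGS defaults. The commented-out designated initializer translates directly into member assignments; a sketch, not part of this patch:

    case GGML_OPT_LBFGS:
        {
            result.type       = GGML_OPT_LBFGS;
            result.graph_size = GGML_DEFAULT_GRAPH_SIZE;
            result.n_threads  = 1;
            result.past       = 0;
            result.delta      = 1e-5f;
            result.max_no_improvement = 0;
            result.print_forward_graph  = true;
            result.print_backward_graph = true;
            result.n_gradient_accumulation = 1;
            result.lbfgs.m              = 6;
            result.lbfgs.n_iter         = 100;
            result.lbfgs.max_linesearch = 20;
            result.lbfgs.eps      = 1e-5f;
            result.lbfgs.ftol     = 1e-4f;
            result.lbfgs.wolfe    = 0.9f;
            result.lbfgs.min_step = 1e-20f;
            result.lbfgs.max_step = 1e+20f;
            result.lbfgs.linesearch = GGML_LINESEARCH_DEFAULT;
        } break;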
@@ -18255,36 +18261,36 @@ struct gguf_str {
 };

 static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
-    [GGUF_TYPE_INT8]    = sizeof(int8_t),
-    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
-    [GGUF_TYPE_INT16]   = sizeof(int16_t),
-    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
-    [GGUF_TYPE_INT32]   = sizeof(int32_t),
-    [GGUF_TYPE_FLOAT32] = sizeof(float),
-    [GGUF_TYPE_BOOL]    = sizeof(bool),
-    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
-    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
-    [GGUF_TYPE_INT64]   = sizeof(int64_t),
-    [GGUF_TYPE_FLOAT64] = sizeof(double),
-    [GGUF_TYPE_ARRAY]   = 0, // undefined
+    // [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
+    // [GGUF_TYPE_INT8]    = sizeof(int8_t),
+    // [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
+    // [GGUF_TYPE_INT16]   = sizeof(int16_t),
+    // [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
+    // [GGUF_TYPE_INT32]   = sizeof(int32_t),
+    // [GGUF_TYPE_FLOAT32] = sizeof(float),
+    // [GGUF_TYPE_BOOL]    = sizeof(bool),
+    // [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
+    // [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
+    // [GGUF_TYPE_INT64]   = sizeof(int64_t),
+    // [GGUF_TYPE_FLOAT64] = sizeof(double),
+    // [GGUF_TYPE_ARRAY]   = 0, // undefined
 };
 static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

 static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = "u8",
-    [GGUF_TYPE_INT8]    = "i8",
-    [GGUF_TYPE_UINT16]  = "u16",
-    [GGUF_TYPE_INT16]   = "i16",
-    [GGUF_TYPE_UINT32]  = "u32",
-    [GGUF_TYPE_INT32]   = "i32",
-    [GGUF_TYPE_FLOAT32] = "f32",
-    [GGUF_TYPE_BOOL]    = "bool",
-    [GGUF_TYPE_STRING]  = "str",
-    [GGUF_TYPE_ARRAY]   = "arr",
-    [GGUF_TYPE_UINT64]  = "u64",
-    [GGUF_TYPE_INT64]   = "i64",
-    [GGUF_TYPE_FLOAT64] = "f64",
+    // [GGUF_TYPE_UINT8]   = "u8",
+    // [GGUF_TYPE_INT8]    = "i8",
+    // [GGUF_TYPE_UINT16]  = "u16",
+    // [GGUF_TYPE_INT16]   = "i16",
+    // [GGUF_TYPE_UINT32]  = "u32",
+    // [GGUF_TYPE_INT32]   = "i32",
+    // [GGUF_TYPE_FLOAT32] = "f32",
+    // [GGUF_TYPE_BOOL]    = "bool",
+    // [GGUF_TYPE_STRING]  = "str",
+    // [GGUF_TYPE_ARRAY]   = "arr",
+    // [GGUF_TYPE_UINT64]  = "u64",
+    // [GGUF_TYPE_INT64]   = "i64",
+    // [GGUF_TYPE_FLOAT64] = "f64",
 };
 static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
@@ -18366,14 +18372,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
     bool ok = true;

-    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
+    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1);
     ok = ok && gguf_fread_el(file, p->data, p->n, offset);

     return ok;
 }

 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -18418,7 +18424,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     bool ok = true;

-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

     // read the header
     {
@@ -18450,7 +18456,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // read the kv pairs
     {
-        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
+        ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

         for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
@@ -18525,7 +18531,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // read the tensor infos
     {
-        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -18645,10 +18651,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // create the tensors
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             const int64_t ne[GGML_MAX_DIMS] = {
-                ctx->infos[i].ne[0],
-                ctx->infos[i].ne[1],
-                ctx->infos[i].ne[2],
-                ctx->infos[i].ne[3],
+                (int64_t)ctx->infos[i].ne[0], // FIXME narrowing
+                (int64_t)ctx->infos[i].ne[1],
+                (int64_t)ctx->infos[i].ne[2],
+                (int64_t)ctx->infos[i].ne[3],
             };

             struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
@@ -18929,7 +18935,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int n_kv = gguf_get_n_kv(ctx);

-    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
     ctx->kv[n_kv].key.n = strlen(key);
     ctx->kv[n_kv].key.data = strdup(key);

     ctx->header.n_kv++;
@@ -19065,7 +19071,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *));
                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
@@ -19086,7 +19092,7 @@ void gguf_add_tensor(
         struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
     const int idx = ctx->header.n_tensors;
-    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+    ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));

     ctx->infos[idx].name.n = strlen(tensor->name);
     ctx->infos[idx].name.data = strdup(tensor->name);
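The remaining hunks are all the same C++-ism: `void *` does not convert implicitly to other object pointers in C++, so every `malloc`/`realloc`/`calloc` result needs an explicit cast. The commented-out `GGUF_TYPE_SIZE`/`GGUF_TYPE_NAME` tables could also be repopulated at startup the same way as `type_traits` above. Rather than casting at each call site, a small helper template could centralize the allocation pattern; a sketch, with hypothetical helper names not part of this patch:

    #include <cstdlib>

    // typed allocation helpers: the cast lives in one place
    template <typename T>
    static T * ggml_malloc_n(size_t n) {
        return (T *) malloc(n * sizeof(T));
    }

    template <typename T>
    static T * ggml_realloc_n(T * p, size_t n) {
        return (T *) realloc(p, n * sizeof(T));
    }

    // e.g.:
    //   ctx->kv    = ggml_malloc_n<gguf_kv>(ctx->header.n_kv);
    //   ctx->infos = ggml_realloc_n(ctx->infos, idx + 1);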