From ce926fe879b9788607126e94be27cca1810a014f Mon Sep 17 00:00:00 2001 From: Dan Johansson Date: Thu, 8 Aug 2024 13:52:59 +0200 Subject: [PATCH 1/3] ggml: Added run-time detection of neon, i8mm and sve Adds run-time detection of the Arm instructions set features neon, i8mm and sve for Linux and Apple build targets. --- ggml/include/ggml.h | 3 ++ ggml/src/ggml-aarch64.c | 13 +----- ggml/src/ggml-quants.c | 4 +- ggml/src/ggml-quants.h | 4 -- ggml/src/ggml.c | 98 +++++++++++++++++++++++++++++++++++------ 5 files changed, 91 insertions(+), 31 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e24b8a319fc50..9f96e0c489b38 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2507,6 +2507,9 @@ extern "C" { GGML_API int ggml_cpu_has_cann (void); GGML_API int ggml_cpu_has_llamafile (void); + // get the sve vector length in bytes + GGML_API int ggml_cpu_get_sve_cnt(void); + // // Internal types and functions exposed for tests and benchmarks // diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 8912de63d9252..b27f411474f4c 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_ return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); } -// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported. -static int sve_lane_count(void) { -#if defined(__ARM_FEATURE_SVE) - return ggml_sve_cnt_b; -#else - return 0; -#endif -} - void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) #if defined(__ARM_FEATURE_SVE) - if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) { + if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) { + if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 8bffce860a1eb..7aa6dce8907f5 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); - const int vector_length = ggml_sve_cnt_b*8; + const int vector_length = ggml_cpu_get_sve_cnt()*8; // VLA Implementation using switch case switch (vector_length) { @@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); - const int vector_length = ggml_sve_cnt_b*8; + const int vector_length = ggml_cpu_get_sve_cnt()*8; //VLA Implemenation for SVE switch (vector_length) { diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index e96ce2b5e5c4e..df9c4b24ae74f 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); -#if defined(__ARM_FEATURE_SVE) -extern int ggml_sve_cnt_b; -#endif - #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4b782b0c13550..bb2911139fbf2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -39,9 +39,15 @@ #include #endif -#if defined(__ARM_FEATURE_SVE) -int ggml_sve_cnt_b = 0; +#if defined(__aarch64__) +struct ggml_aarch64_features_type { + int has_neon; + int has_i8mm; + int has_sve; + int sve_cnt; +} ggml_aarch64_features = {-1, -1, -1, 0}; #endif + #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -3673,6 +3679,65 @@ static inline int ggml_up(int n, int m) { //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + +#if defined(__linux__) +#include +#elif defined(__APPLE__) +#include +#endif + +static void ggml_init_aarch64_features(void) { +#if defined(__linux__) + uint32_t hwcap = getauxval(AT_HWCAP); + uint32_t hwcap2 = getauxval(AT_HWCAP2); + + ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD); + ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); + ggml_aarch64_features.has_sve = !!(hwcap & HWCAP_SVE); +#if defined(__ARM_FEATURE_SVE) + ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); +#endif +#elif defined(__APPLE__) + int oldp = 0; + size_t size = sizeof(oldp); + if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_aarch64_features.has_neon = oldp; + + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_aarch64_features.has_i8mm = oldp; + + ggml_aarch64_features.has_sve = 0; + ggml_aarch64_features.sve_cnt = 0; +#else +// Run-time CPU feature detection not implemented for this platform, fallback to compile time +#if defined(__ARM_NEON) + ggml_aarch64_features.has_neon = 1; +#else + ggml_aarch64_features.has_neon = 0; +#endif + +#if defined(__ARM_FEATURE_MATMUL_INT8) + ggml_aarch64_features.has_i8mm = 1; +#else + ggml_aarch64_features.has_i8mm = 0; +#endif + +#if defined(__ARM_FEATURE_SVE) + ggml_aarch64_features.has_sve = 1; + ggml_aarch64_features.sve_cnt = 16; +#else + ggml_aarch64_features.has_sve = 0; + ggml_aarch64_features.sve_cnt = 0; +#endif +#endif +} +#endif + struct ggml_context * ggml_init(struct ggml_init_params params) { // make this function thread safe ggml_critical_section_start(); @@ -3723,6 +3788,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } +#if defined(__aarch64__) + ggml_init_aarch64_features(); +#endif + is_first_call = false; } @@ -3771,12 +3840,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_ASSERT_ALIGNED(ctx->mem_buffer); -#if defined(__ARM_FEATURE_SVE) - if (!ggml_sve_cnt_b) { - ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); - } -#endif - GGML_PRINT_DEBUG("%s: context initialized\n", __func__); ggml_critical_section_end(); @@ -23578,16 +23641,16 @@ int ggml_cpu_has_fma(void) { } int ggml_cpu_has_neon(void) { -#if defined(__ARM_NEON) - return 1; +#if defined(__aarch64__) + return ggml_aarch64_features.has_neon; #else return 0; #endif } int ggml_cpu_has_sve(void) { -#if defined(__ARM_FEATURE_SVE) - return 1; +#if defined(__aarch64__) + return ggml_aarch64_features.has_sve; #else return 0; #endif @@ -23734,11 +23797,18 @@ int ggml_cpu_has_vsx(void) { } int ggml_cpu_has_matmul_int8(void) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - return 1; +#if defined(__aarch64__) + return ggml_aarch64_features.has_i8mm; #else return 0; #endif } +int ggml_cpu_get_sve_cnt(void) { +#if defined(__aarch64__) + return ggml_aarch64_features.sve_cnt; +#else + return 0; +#endif +} //////////////////////////////////////////////////////////////////////////////// From 8fd848dd30f5e370aabfad0061eced194373a4ce Mon Sep 17 00:00:00 2001 From: Dan Johansson Date: Thu, 19 Sep 2024 12:45:11 +0200 Subject: [PATCH 2/3] ggml: Extend feature detection to include non aarch64 Arm arch --- ggml/src/ggml.c | 67 +++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bb2911139fbf2..6eabf3aa8e906 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -39,13 +39,13 @@ #include #endif -#if defined(__aarch64__) -struct ggml_aarch64_features_type { +#if defined(__ARM_ARCH) +struct ggml_arm_arch_features_type { int has_neon; int has_i8mm; int has_sve; int sve_cnt; -} ggml_aarch64_features = {-1, -1, -1, 0}; +} ggml_arm_arch_features = {-1, -1, -1, 0}; #endif #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) @@ -3679,24 +3679,25 @@ static inline int ggml_up(int n, int m) { //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) +#if defined(__ARM_ARCH) -#if defined(__linux__) +#if defined(__linux__) && defined(__aarch64__) #include #elif defined(__APPLE__) #include #endif -static void ggml_init_aarch64_features(void) { -#if defined(__linux__) +static void ggml_init_arm_arch_features(void) { +#if defined(__linux__) && defined(__aarch64__) uint32_t hwcap = getauxval(AT_HWCAP); uint32_t hwcap2 = getauxval(AT_HWCAP2); - ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD); - ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); - ggml_aarch64_features.has_sve = !!(hwcap & HWCAP_SVE); + ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); + ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); + ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); + #if defined(__ARM_FEATURE_SVE) - ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); + ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); #endif #elif defined(__APPLE__) int oldp = 0; @@ -3704,35 +3705,35 @@ static void ggml_init_aarch64_features(void) { if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { oldp = 0; } - ggml_aarch64_features.has_neon = oldp; + ggml_arm_arch_features.has_neon = oldp; if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { oldp = 0; } - ggml_aarch64_features.has_i8mm = oldp; + ggml_arm_arch_features.has_i8mm = oldp; - ggml_aarch64_features.has_sve = 0; - ggml_aarch64_features.sve_cnt = 0; + ggml_arm_arch_features.has_sve = 0; + ggml_arm_arch_features.sve_cnt = 0; #else // Run-time CPU feature detection not implemented for this platform, fallback to compile time #if defined(__ARM_NEON) - ggml_aarch64_features.has_neon = 1; + ggml_arm_arch_features.has_neon = 1; #else - ggml_aarch64_features.has_neon = 0; + ggml_arm_arch_features.has_neon = 0; #endif #if defined(__ARM_FEATURE_MATMUL_INT8) - ggml_aarch64_features.has_i8mm = 1; + ggml_arm_arch_features.has_i8mm = 1; #else - ggml_aarch64_features.has_i8mm = 0; + ggml_arm_arch_features.has_i8mm = 0; #endif #if defined(__ARM_FEATURE_SVE) - ggml_aarch64_features.has_sve = 1; - ggml_aarch64_features.sve_cnt = 16; + ggml_arm_arch_features.has_sve = 1; + ggml_arm_arch_features.sve_cnt = 16; #else - ggml_aarch64_features.has_sve = 0; - ggml_aarch64_features.sve_cnt = 0; + ggml_arm_arch_features.has_sve = 0; + ggml_arm_arch_features.sve_cnt = 0; #endif #endif } @@ -3788,8 +3789,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } -#if defined(__aarch64__) - ggml_init_aarch64_features(); +#if defined(__ARM_ARCH) + ggml_init_arm_arch_features(); #endif is_first_call = false; @@ -23641,16 +23642,16 @@ int ggml_cpu_has_fma(void) { } int ggml_cpu_has_neon(void) { -#if defined(__aarch64__) - return ggml_aarch64_features.has_neon; +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_neon; #else return 0; #endif } int ggml_cpu_has_sve(void) { -#if defined(__aarch64__) - return ggml_aarch64_features.has_sve; +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_sve; #else return 0; #endif @@ -23797,16 +23798,16 @@ int ggml_cpu_has_vsx(void) { } int ggml_cpu_has_matmul_int8(void) { -#if defined(__aarch64__) - return ggml_aarch64_features.has_i8mm; +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_i8mm; #else return 0; #endif } int ggml_cpu_get_sve_cnt(void) { -#if defined(__aarch64__) - return ggml_aarch64_features.sve_cnt; +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.sve_cnt; #else return 0; #endif From a48284c5f36d7881b24fa1dd1c41e28e0c9da56f Mon Sep 17 00:00:00 2001 From: Dan Johansson Date: Wed, 25 Sep 2024 12:42:57 +0200 Subject: [PATCH 3/3] ggml: Move definition of ggml_arm_arch_features to the global data section --- ggml/src/ggml.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6eabf3aa8e906..fac4466e31d44 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -39,15 +39,6 @@ #include #endif -#if defined(__ARM_ARCH) -struct ggml_arm_arch_features_type { - int has_neon; - int has_i8mm; - int has_sve; - int sve_cnt; -} ggml_arm_arch_features = {-1, -1, -1, 0}; -#endif - #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -461,6 +452,15 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) (ggml-impl.h) float ggml_table_f32_f16[1 << 16]; +#if defined(__ARM_ARCH) +struct ggml_arm_arch_features_type { + int has_neon; + int has_i8mm; + int has_sve; + int sve_cnt; +} ggml_arm_arch_features = {-1, -1, -1, 0}; +#endif + GGML_CALL const char * ggml_status_to_string(enum ggml_status status) { switch (status) { case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";