From ce926fe879b9788607126e94be27cca1810a014f Mon Sep 17 00:00:00 2001
From: Dan Johansson <dan.johansson@arm.com>
Date: Thu, 8 Aug 2024 13:52:59 +0200
Subject: [PATCH 1/3] ggml: Added run-time detection of neon, i8mm and sve

Adds run-time detection of the Arm instructions set features
neon, i8mm and sve for Linux and Apple build targets.
---
 ggml/include/ggml.h     |  3 ++
 ggml/src/ggml-aarch64.c | 13 +-----
 ggml/src/ggml-quants.c  |  4 +-
 ggml/src/ggml-quants.h  |  4 --
 ggml/src/ggml.c         | 98 +++++++++++++++++++++++++++++++++++------
 5 files changed, 91 insertions(+), 31 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index e24b8a319fc50..9f96e0c489b38 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2507,6 +2507,9 @@ extern "C" {
     GGML_API int ggml_cpu_has_cann       (void);
     GGML_API int ggml_cpu_has_llamafile  (void);
 
+    // get the sve vector length in bytes
+    GGML_API int ggml_cpu_get_sve_cnt(void);
+
     //
     // Internal types and functions exposed for tests and benchmarks
     //
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 8912de63d9252..b27f411474f4c 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
 
-// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
-static int sve_lane_count(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return ggml_sve_cnt_b;
-#else
-    return 0;
-#endif
-}
-
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
+    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 8bffce860a1eb..7aa6dce8907f5 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
     // VLA Implementation using switch case
     switch (vector_length) {
@@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
     //VLA Implemenation for SVE
     switch (vector_length) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index e96ce2b5e5c4e..df9c4b24ae74f 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
-#if defined(__ARM_FEATURE_SVE)
-extern int ggml_sve_cnt_b;
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4b782b0c13550..bb2911139fbf2 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -39,9 +39,15 @@
 #include <unistd.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE)
-int ggml_sve_cnt_b = 0;
+#if defined(__aarch64__)
+struct ggml_aarch64_features_type {
+    int has_neon;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_aarch64_features = {-1, -1, -1, 0};
 #endif
+
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -3673,6 +3679,65 @@ static inline int ggml_up(int n, int m) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+static void ggml_init_aarch64_features(void) {
+#if defined(__linux__)
+    uint32_t hwcap = getauxval(AT_HWCAP);
+    uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+    ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_aarch64_features.has_sve  = !!(hwcap & HWCAP_SVE);
+#if defined(__ARM_FEATURE_SVE)
+    ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+#elif defined(__APPLE__)
+    int oldp = 0;
+    size_t size = sizeof(oldp);
+    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_aarch64_features.has_neon = oldp;
+
+    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_aarch64_features.has_i8mm = oldp;
+
+    ggml_aarch64_features.has_sve = 0;
+    ggml_aarch64_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection not implemented for this platform, fallback to compile time
+#if defined(__ARM_NEON)
+    ggml_aarch64_features.has_neon = 1;
+#else
+    ggml_aarch64_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_aarch64_features.has_i8mm = 1;
+#else
+    ggml_aarch64_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_aarch64_features.has_sve = 1;
+    ggml_aarch64_features.sve_cnt = 16;
+#else
+    ggml_aarch64_features.has_sve = 0;
+    ggml_aarch64_features.sve_cnt = 0;
+#endif
+#endif
+}
+#endif
+
 struct ggml_context * ggml_init(struct ggml_init_params params) {
     // make this function thread safe
     ggml_critical_section_start();
@@ -3723,6 +3788,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
+#if defined(__aarch64__)
+        ggml_init_aarch64_features();
+#endif
+
         is_first_call = false;
     }
 
@@ -3771,12 +3840,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (!ggml_sve_cnt_b) {
-        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-    }
-#endif
-
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     ggml_critical_section_end();
@@ -23578,16 +23641,16 @@ int ggml_cpu_has_fma(void) {
 }
 
 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_neon;
 #else
     return 0;
 #endif
 }
 
 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_sve;
 #else
     return 0;
 #endif
@@ -23734,11 +23797,18 @@ int ggml_cpu_has_vsx(void) {
 }
 
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_i8mm;
 #else
     return 0;
 #endif
 }
 
+int ggml_cpu_get_sve_cnt(void) {
+#if defined(__aarch64__)
+    return ggml_aarch64_features.sve_cnt;
+#else
+    return 0;
+#endif
+}
 ////////////////////////////////////////////////////////////////////////////////

From 8fd848dd30f5e370aabfad0061eced194373a4ce Mon Sep 17 00:00:00 2001
From: Dan Johansson <dan.johansson@arm.com>
Date: Thu, 19 Sep 2024 12:45:11 +0200
Subject: [PATCH 2/3] ggml: Extend feature detection to include non aarch64 Arm
 arch

---
 ggml/src/ggml.c | 67 +++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bb2911139fbf2..6eabf3aa8e906 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -39,13 +39,13 @@
 #include <unistd.h>
 #endif
 
-#if defined(__aarch64__)
-struct ggml_aarch64_features_type {
+#if defined(__ARM_ARCH)
+struct ggml_arm_arch_features_type {
     int has_neon;
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-} ggml_aarch64_features = {-1, -1, -1, 0};
+} ggml_arm_arch_features = {-1, -1, -1, 0};
 #endif
 
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
@@ -3679,24 +3679,25 @@ static inline int ggml_up(int n, int m) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__aarch64__)
+#if defined(__ARM_ARCH)
 
-#if defined(__linux__)
+#if defined(__linux__) && defined(__aarch64__)
 #include <sys/auxv.h>
 #elif defined(__APPLE__)
 #include <sys/sysctl.h>
 #endif
 
-static void ggml_init_aarch64_features(void) {
-#if defined(__linux__)
+static void ggml_init_arm_arch_features(void) {
+#if defined(__linux__) && defined(__aarch64__)
     uint32_t hwcap = getauxval(AT_HWCAP);
     uint32_t hwcap2 = getauxval(AT_HWCAP2);
 
-    ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD);
-    ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
-    ggml_aarch64_features.has_sve  = !!(hwcap & HWCAP_SVE);
+    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
+
 #if defined(__ARM_FEATURE_SVE)
-    ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
 #endif
 #elif defined(__APPLE__)
     int oldp = 0;
@@ -3704,35 +3705,35 @@ static void ggml_init_aarch64_features(void) {
     if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
         oldp = 0;
     }
-    ggml_aarch64_features.has_neon = oldp;
+    ggml_arm_arch_features.has_neon = oldp;
 
     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
         oldp = 0;
     }
-    ggml_aarch64_features.has_i8mm = oldp;
+    ggml_arm_arch_features.has_i8mm = oldp;
 
-    ggml_aarch64_features.has_sve = 0;
-    ggml_aarch64_features.sve_cnt = 0;
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
 #else
 // Run-time CPU feature detection not implemented for this platform, fallback to compile time
 #if defined(__ARM_NEON)
-    ggml_aarch64_features.has_neon = 1;
+    ggml_arm_arch_features.has_neon = 1;
 #else
-    ggml_aarch64_features.has_neon = 0;
+    ggml_arm_arch_features.has_neon = 0;
 #endif
 
 #if defined(__ARM_FEATURE_MATMUL_INT8)
-    ggml_aarch64_features.has_i8mm = 1;
+    ggml_arm_arch_features.has_i8mm = 1;
 #else
-    ggml_aarch64_features.has_i8mm = 0;
+    ggml_arm_arch_features.has_i8mm = 0;
 #endif
 
 #if defined(__ARM_FEATURE_SVE)
-    ggml_aarch64_features.has_sve = 1;
-    ggml_aarch64_features.sve_cnt = 16;
+    ggml_arm_arch_features.has_sve = 1;
+    ggml_arm_arch_features.sve_cnt = 16;
 #else
-    ggml_aarch64_features.has_sve = 0;
-    ggml_aarch64_features.sve_cnt = 0;
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
 #endif
 #endif
 }
@@ -3788,8 +3789,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
-#if defined(__aarch64__)
-        ggml_init_aarch64_features();
+#if defined(__ARM_ARCH)
+        ggml_init_arm_arch_features();
 #endif
 
         is_first_call = false;
@@ -23641,16 +23642,16 @@ int ggml_cpu_has_fma(void) {
 }
 
 int ggml_cpu_has_neon(void) {
-#if defined(__aarch64__)
-    return ggml_aarch64_features.has_neon;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_neon;
 #else
     return 0;
 #endif
 }
 
 int ggml_cpu_has_sve(void) {
-#if defined(__aarch64__)
-    return ggml_aarch64_features.has_sve;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_sve;
 #else
     return 0;
 #endif
@@ -23797,16 +23798,16 @@ int ggml_cpu_has_vsx(void) {
 }
 
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__aarch64__)
-    return ggml_aarch64_features.has_i8mm;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_i8mm;
 #else
     return 0;
 #endif
 }
 
 int ggml_cpu_get_sve_cnt(void) {
-#if defined(__aarch64__)
-    return ggml_aarch64_features.sve_cnt;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.sve_cnt;
 #else
     return 0;
 #endif

From a48284c5f36d7881b24fa1dd1c41e28e0c9da56f Mon Sep 17 00:00:00 2001
From: Dan Johansson <dan.johansson@arm.com>
Date: Wed, 25 Sep 2024 12:42:57 +0200
Subject: [PATCH 3/3] ggml: Move definition of ggml_arm_arch_features to the
 global data section

---
 ggml/src/ggml.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 6eabf3aa8e906..fac4466e31d44 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -39,15 +39,6 @@
 #include <unistd.h>
 #endif
 
-#if defined(__ARM_ARCH)
-struct ggml_arm_arch_features_type {
-    int has_neon;
-    int has_i8mm;
-    int has_sve;
-    int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, 0};
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -461,6 +452,15 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
+#if defined(__ARM_ARCH)
+struct ggml_arm_arch_features_type {
+    int has_neon;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_arm_arch_features = {-1, -1, -1, 0};
+#endif
+
 GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
     switch (status) {
         case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";