From 705006f7af5597c1b53350c8188eadd9a071887e Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Thu, 19 Dec 2024 02:54:21 +0100
Subject: [PATCH] Do not quantize activations if not necessary #79

Credits: Iwan Kawrakow (@Ikawrakow)
---
 ggml/include/ggml-cpu.h           |  1 +
 ggml/src/ggml-cpu/ggml-cpu-impl.h |  1 +
 ggml/src/ggml-cpu/ggml-cpu.c      | 84 ++++++++++++++++++++++---------
 3 files changed, 63 insertions(+), 23 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 3aa71badb5fb0..5965c4e10d6f7 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -11,6 +11,7 @@ extern "C" {
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
         size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        size_t    q_size;
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
         int n_threads;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index d71076ad12b1f..66672a1c196de 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -21,6 +21,7 @@ struct ggml_compute_params {
 
     // work buffer for all threads
     size_t wsize;
+    size_t qsize;
     void * wdata;
 
     struct ggml_threadpool * threadpool;
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index d0229edcfff8a..7f3e294337963 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -7980,7 +7980,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
         return;
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
     assert(ne12 % ne02 == 0);
@@ -8111,7 +8111,12 @@ UseGgmlGemm1:;
 #endif
 
     if (src1->type != vec_dot_type) {
-        char * wdata = params->wdata;
+        char * wdata = (char *)params->wdata + params->wsize - params->qsize;
+
+        if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) {
+            goto AlreadyQuantized;
+        }
+        wdata += GGML_MAX_NAME;
 
 #if IK_PRINT_TIMING
         int64_t t1 = ggml_time_us();
@@ -8121,7 +8126,7 @@ UseGgmlGemm1:;
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->wsize >= ne13*nbw3);
+        assert(params->qsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -8133,23 +8138,28 @@ UseGgmlGemm1:;
                 }
             }
         }
-
+        ggml_barrier(params->threadpool);
+	
 #if IK_PRINT_TIMING
         int64_t t2 = ggml_time_us();
         if (ith == 0) printf("quantize(%s): %d us\n", dst->name, (int)(t2 - t1));
 #endif
-    }
 
-    if (ith == 0) {
-        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
+        if (ith == 0) {
+            wdata -= GGML_MAX_NAME;
+            memcpy(wdata, src1->name, GGML_MAX_NAME);
+            // Every thread starts at ith, so the first unprocessed chunk is nth.  This saves a bit of coordination right at the start.
+            //atomic_store(&params->shared->current_chunk, nth);
+        }
+
+AlreadyQuantized:;
     }
 
-    ggml_barrier(params->threadpool);
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data
+                       : (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);
 
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
         for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -8171,6 +8181,13 @@ UseGgmlGemm1:;
 UseGgmlGemm2:;
 #endif
 
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth.  This saves a bit of coordination right at the start.
+        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
+    }
+
+    ggml_barrier(params->threadpool);
+
     // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;
 
@@ -8273,9 +8290,10 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as  = ne02;       // n_expert
 
-    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-            (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    char * qdata = (char *)params->wdata + params->wsize - params->qsize;
+
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?  qdata :
+            qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
     struct mmid_row_mapping {
         int32_t i1;
@@ -8285,14 +8303,19 @@ static void ggml_compute_forward_mul_mat_id(
     int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
     struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
+    bool store_name = false;
     if (src1->type != vec_dot_type) {
-        char * wdata = params->wdata;
+        if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
+            goto QuantizationAlreadyDone;
+        }
+        store_name = true;
+        char * wdata = qdata + GGML_MAX_NAME;
 
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->wsize >= ne13*nbw3);
+        assert(params->qsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -8308,7 +8331,12 @@ static void ggml_compute_forward_mul_mat_id(
 
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
 
+QuantizationAlreadyDone:;
     if (ith == 0) {
+        if (store_name) {
+            memcpy(qdata, src1->name, GGML_MAX_NAME);
+        }
+
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
@@ -8337,7 +8365,7 @@ static void ggml_compute_forward_mul_mat_id(
 
         const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
-        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
         const int64_t nr0 = ne01; // src0 rows
@@ -14361,6 +14389,7 @@ struct ggml_cplan ggml_graph_plan(
     }
 
     size_t work_size = 0;
+    size_t q_size = 0;
 
     struct ggml_cplan cplan;
     memset(&cplan, 0, sizeof(struct ggml_cplan));
@@ -14376,6 +14405,7 @@ struct ggml_cplan ggml_graph_plan(
         max_tasks = MAX(max_tasks, n_tasks);
 
         size_t cur = 0;
+        size_t cur_q = 0;
 
         if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
 
@@ -14413,12 +14443,13 @@ struct ggml_cplan ggml_graph_plan(
 
 #if defined(GGML_USE_CLBLAST)
                         if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                            cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
+                            cur_q = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
                         } else
 #endif
+
                         if (node->src[1]->type != vec_dot_type) {
-                            size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
-                            cur = MAX(cur, cur2);
+                            size_t cur2_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                            cur_q = MAX(cur_q, cur2_q);
                         }
                     } break;
                 case GGML_OP_MUL_MAT_ID:
@@ -14428,12 +14459,12 @@ struct ggml_cplan ggml_graph_plan(
                         const struct ggml_tensor * src1 = node->src[1];
                         const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
                         if (src1->type != vec_dot_type) {
-                            cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                            cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                         }
                         const int n_as = src0->ne[2];
-                        cur += GGML_PAD(cur, sizeof(int64_t));       // align
-                        cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                        cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                        cur_q += GGML_PAD(cur, sizeof(int64_t));       // align
+                        cur_q += n_as * sizeof(int64_t);               // matrix_row_counts
+                        cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                     } break;
                 case GGML_OP_OUT_PROD:
                     {
@@ -14522,15 +14553,21 @@ struct ggml_cplan ggml_graph_plan(
         }
 
         work_size = MAX(work_size, cur);
+        q_size    = MAX(q_size, cur_q);
     }
 
     if (work_size > 0) {
         work_size += CACHE_LINE_SIZE*(n_threads);
     }
+    if (q_size > 0) {
+        q_size += GGML_MAX_NAME;
+    }
+    work_size += q_size;
 
     cplan.threadpool = threadpool;
     cplan.n_threads  = MIN(max_tasks, n_threads);
     cplan.work_size  = work_size;
+    cplan.q_size    = q_size;
     cplan.work_data  = NULL;
 
     return cplan;
@@ -14549,6 +14586,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.ith       =*/ state->ith,
         /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
         /*.wsize     =*/ cplan->work_size,
+        /*.qsize     =*/ cplan->q_size,
         /*.wdata     =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };