From 705006f7af5597c1b53350c8188eadd9a071887e Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Thu, 19 Dec 2024 02:54:21 +0100 Subject: [PATCH] Do not quantize activations if not necessary #79 Credits : Iwan Kawrakow @Ikawrakow --- ggml/include/ggml-cpu.h | 1 + ggml/src/ggml-cpu/ggml-cpu-impl.h | 1 + ggml/src/ggml-cpu/ggml-cpu.c | 84 ++++++++++++++++++++++--------- 3 files changed, 63 insertions(+), 23 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 3aa71badb5fb0..5965c4e10d6f7 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -11,6 +11,7 @@ extern "C" { // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + size_t q_size; uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index d71076ad12b1f..66672a1c196de 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -21,6 +21,7 @@ struct ggml_compute_params { // work buffer for all threads size_t wsize; + size_t qsize; void * wdata; struct ggml_threadpool * threadpool; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index d0229edcfff8a..7f3e294337963 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -7980,7 +7980,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( return; } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME; const size_t row_size = ggml_row_size(vec_dot_type, ne10); assert(ne12 % ne02 == 0); @@ -8111,7 +8111,12 @@ UseGgmlGemm1:; #endif if (src1->type != vec_dot_type) { - char * wdata = params->wdata; + char * wdata = (char *)params->wdata + params->wsize - params->qsize; + + if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) { + goto AlreadyQuantized; + } + wdata += GGML_MAX_NAME; #if IK_PRINT_TIMING int64_t t1 = ggml_time_us(); @@ -8121,7 +8126,7 @@ UseGgmlGemm1:; const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; - assert(params->wsize >= ne13*nbw3); + assert(params->qsize >= ne13*nbw3); GGML_ASSERT(src1->type == GGML_TYPE_F32); for (int64_t i13 = 0; i13 < ne13; ++i13) { @@ -8133,23 +8138,28 @@ UseGgmlGemm1:; } } } - + ggml_barrier(params->threadpool); + #if IK_PRINT_TIMING int64_t t2 = ggml_time_us(); if (ith == 0) printf("quantize(%s): %d us\n", dst->name, (int)(t2 - t1)); #endif - } - if (ith == 0) { - // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. - atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed); + if (ith == 0) { + wdata -= GGML_MAX_NAME; + memcpy(wdata, src1->name, GGML_MAX_NAME); + // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. + //atomic_store(&params->shared->current_chunk, nth); + } + +AlreadyQuantized:; } - ggml_barrier(params->threadpool); + const void * wdata = (src1->type == vec_dot_type) ? src1->data + : (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME); #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { - const void* wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); for (int64_t i13 = 0; i13 < ne13; i13++) @@ -8171,6 +8181,13 @@ UseGgmlGemm1:; UseGgmlGemm2:; #endif + if (ith == 0) { + // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. + atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed); + } + + ggml_barrier(params->threadpool); + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; @@ -8273,9 +8290,10 @@ static void ggml_compute_forward_mul_mat_id( const int n_ids = ids->ne[0]; // n_expert_used const int n_as = ne02; // n_expert - char * wdata_src1_end = (src1->type == vec_dot_type) ? - (char *) params->wdata : - (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); + char * qdata = (char *)params->wdata + params->wsize - params->qsize; + + char * wdata_src1_end = (src1->type == vec_dot_type) ? 
qdata : + qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); struct mmid_row_mapping { int32_t i1; @@ -8285,14 +8303,19 @@ static void ggml_compute_forward_mul_mat_id( int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11] + bool store_name = false; if (src1->type != vec_dot_type) { - char * wdata = params->wdata; + if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) { + goto QuantizationAlreadyDone; + } + store_name = true; + char * wdata = qdata + GGML_MAX_NAME; const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; - assert(params->wsize >= ne13*nbw3); + assert(params->qsize >= ne13*nbw3); GGML_ASSERT(src1->type == GGML_TYPE_F32); for (int64_t i13 = 0; i13 < ne13; ++i13) { @@ -8308,7 +8331,12 @@ static void ggml_compute_forward_mul_mat_id( #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] +QuantizationAlreadyDone:; if (ith == 0) { + if (store_name) { + memcpy(qdata, src1->name, GGML_MAX_NAME); + } + // initialize matrix_row_counts memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); @@ -8337,7 +8365,7 @@ static void ggml_compute_forward_mul_mat_id( const char * src0_cur = (const char *) src0->data + cur_a*nb02; - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : qdata + GGML_MAX_NAME; const size_t row_size = ggml_row_size(vec_dot_type, ne10); const int64_t nr0 = ne01; // src0 rows @@ -14361,6 +14389,7 @@ struct ggml_cplan ggml_graph_plan( } size_t work_size = 0; + size_t q_size = 0; struct ggml_cplan cplan; memset(&cplan, 0, sizeof(struct ggml_cplan)); @@ -14376,6 +14405,7 @@ struct ggml_cplan ggml_graph_plan( max_tasks = MAX(max_tasks, n_tasks); size_t cur = 0; + size_t cur_q = 0; if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) { @@ -14413,12 +14443,13 @@ struct ggml_cplan ggml_graph_plan( #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { - cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); + cur_q = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); } else #endif + if (node->src[1]->type != vec_dot_type) { - size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); - cur = MAX(cur, cur2); + size_t cur2_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + cur_q = MAX(cur_q, cur2_q); } } break; case GGML_OP_MUL_MAT_ID: @@ -14428,12 +14459,12 @@ struct ggml_cplan ggml_graph_plan( const struct ggml_tensor * src1 = node->src[1]; const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; if (src1->type != vec_dot_type) { - cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); + cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1)); } const int n_as = src0->ne[2]; - cur += GGML_PAD(cur, sizeof(int64_t)); // align - cur += n_as * sizeof(int64_t); // matrix_row_counts - cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows + cur_q += GGML_PAD(cur, sizeof(int64_t)); // align + cur_q += n_as * sizeof(int64_t); // matrix_row_counts + cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows } break; case GGML_OP_OUT_PROD: { @@ -14522,15 +14553,21 @@ struct ggml_cplan ggml_graph_plan( } work_size = MAX(work_size, cur); + q_size = MAX(q_size, cur_q); } if (work_size > 0) { work_size += 
CACHE_LINE_SIZE*(n_threads); } + if (q_size > 0) { + q_size += GGML_MAX_NAME; + } + work_size += q_size; cplan.threadpool = threadpool; cplan.n_threads = MIN(max_tasks, n_threads); cplan.work_size = work_size; + cplan.q_size = q_size; cplan.work_data = NULL; return cplan; @@ -14549,6 +14586,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.ith =*/ state->ith, /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), /*.wsize =*/ cplan->work_size, + /*.qsize =*/ cplan->q_size, /*.wdata =*/ cplan->work_data, /*.threadpool=*/ tp, };