Do not quantize activations if not necessary #79
Credits: Iwan Kawrakow @ikawrakow
Nexesenex committed Dec 21, 2024
1 parent 2092f13 commit 705006f
Showing 3 changed files with 63 additions and 23 deletions.
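Note on the change: the shared work buffer now ends with a dedicated quantization region of q_size bytes, whose first GGML_MAX_NAME bytes hold the name of the src1 tensor whose quantized activations currently sit there. When the same activations are fed to another matrix multiplication, the cached quantized rows are reused instead of being recomputed for every node. The following is a minimal sketch of that idea, not the literal ggml code: get_quantized_src1 and quantize_rows are illustrative names, and GGML_MAX_NAME is assumed to match ggml's 64-byte tensor-name limit.

#include <string.h>

#define GGML_MAX_NAME 64   // assumed to match ggml's tensor-name limit

// Minimal sketch: the last `qsize` bytes of the work buffer start with a
// GGML_MAX_NAME-byte tag naming the tensor whose quantized data follows it.
// If the incoming tensor carries the same name, the cached data is reused.
void * get_quantized_src1(void * wdata, size_t wsize, size_t qsize,
                          const char * src1_name,
                          void (*quantize_rows)(void * dst)) {
    char * qdata = (char *) wdata + wsize - qsize;     // quantization region
    if (strncmp(src1_name, qdata, GGML_MAX_NAME) != 0) {
        quantize_rows(qdata + GGML_MAX_NAME);          // (re)build the cache
        memcpy(qdata, src1_name, GGML_MAX_NAME);       // remember its owner
    }
    return qdata + GGML_MAX_NAME;                      // quantized rows follow the tag
}

In the actual hunks below, the quantization work is split across threads, thread 0 writes the name tag, and the quantized rows start GGML_MAX_NAME bytes past the start of the region; the lookup is the same strncmp/memcpy pair.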
1 change: 1 addition & 0 deletions ggml/include/ggml-cpu.h
@@ -11,6 +11,7 @@ extern "C" {
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
size_t q_size;
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

int n_threads;
1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -21,6 +21,7 @@ struct ggml_compute_params {

// work buffer for all threads
size_t wsize;
size_t qsize;
void * wdata;

struct ggml_threadpool * threadpool;
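The two header additions above just plumb the new size through: ggml_cplan gains q_size alongside work_size, and ggml_compute_params gains qsize so each worker thread can locate the quantization region at the tail of the work buffer. Because ggml_graph_plan() folds q_size into work_size (see the last hunks in ggml-cpu.c), existing callers that allocate work_size bytes need no change. A usage sketch under that assumption, using the current three-argument ggml_graph_plan() signature:

#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"
#include "ggml-cpu.h"

// Sketch of a typical caller (not part of this commit): allocate the plan's
// work buffer and run the graph; the quantization cache occupies the last
// plan.q_size bytes of the same allocation.
enum ggml_status compute_graph(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, /*threadpool=*/NULL);
    uint8_t * buf = plan.work_size > 0 ? malloc(plan.work_size) : NULL;
    plan.work_data = buf;
    enum ggml_status status = ggml_graph_compute(graph, &plan);
    free(buf);
    return status;
}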
84 changes: 61 additions & 23 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -7980,7 +7980,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
return;
}

const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

assert(ne12 % ne02 == 0);
@@ -8111,7 +8111,12 @@ UseGgmlGemm1:;
#endif

if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
char * wdata = (char *)params->wdata + params->wsize - params->qsize;

if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) {
goto AlreadyQuantized;
}
wdata += GGML_MAX_NAME;

#if IK_PRINT_TIMING
int64_t t1 = ggml_time_us();
@@ -8121,7 +8126,7 @@ UseGgmlGemm1:;
const size_t nbw2 = nbw1*ne11;
const size_t nbw3 = nbw2*ne12;

assert(params->wsize >= ne13*nbw3);
assert(params->qsize >= ne13*nbw3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -8133,23 +8138,28 @@ UseGgmlGemm1:;
}
}
}

ggml_barrier(params->threadpool);

#if IK_PRINT_TIMING
int64_t t2 = ggml_time_us();
if (ith == 0) printf("quantize(%s): %d us\n", dst->name, (int)(t2 - t1));
#endif
}

if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
if (ith == 0) {
wdata -= GGML_MAX_NAME;
memcpy(wdata, src1->name, GGML_MAX_NAME);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
//atomic_store(&params->shared->current_chunk, nth);
}

AlreadyQuantized:;
}

ggml_barrier(params->threadpool);
const void * wdata = (src1->type == vec_dot_type) ? src1->data
: (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);

#if GGML_USE_LLAMAFILE
if (src1->type != vec_dot_type) {
const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -8171,6 +8181,13 @@ UseGgmlGemm1:;
UseGgmlGemm2:;
#endif

if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
}

ggml_barrier(params->threadpool);

// This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
const int64_t nr0 = ne0;

@@ -8273,9 +8290,10 @@ static void ggml_compute_forward_mul_mat_id(
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_expert

char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
char * qdata = (char *)params->wdata + params->wsize - params->qsize;

char * wdata_src1_end = (src1->type == vec_dot_type) ? qdata :
qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

struct mmid_row_mapping {
int32_t i1;
@@ -8285,14 +8303,19 @@ static void ggml_compute_forward_mul_mat_id(
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

bool store_name = false;
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
goto QuantizationAlreadyDone;
}
store_name = true;
char * wdata = qdata + GGML_MAX_NAME;

const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
const size_t nbw2 = nbw1*ne11;
const size_t nbw3 = nbw2*ne12;

assert(params->wsize >= ne13*nbw3);
assert(params->qsize >= ne13*nbw3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -8308,7 +8331,12 @@ static void ggml_compute_forward_mul_mat_id(

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]

QuantizationAlreadyDone:;
if (ith == 0) {
if (store_name) {
memcpy(qdata, src1->name, GGML_MAX_NAME);
}

// initialize matrix_row_counts
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

Expand Down Expand Up @@ -8337,7 +8365,7 @@ static void ggml_compute_forward_mul_mat_id(

const char * src0_cur = (const char *) src0->data + cur_a*nb02;

const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

const int64_t nr0 = ne01; // src0 rows
@@ -14361,6 +14389,7 @@ struct ggml_cplan ggml_graph_plan(
}

size_t work_size = 0;
size_t q_size = 0;

struct ggml_cplan cplan;
memset(&cplan, 0, sizeof(struct ggml_cplan));
@@ -14376,6 +14405,7 @@ struct ggml_cplan ggml_graph_plan(
max_tasks = MAX(max_tasks, n_tasks);

size_t cur = 0;
size_t cur_q = 0;

if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {

@@ -14413,12 +14443,13 @@ struct ggml_cplan ggml_graph_plan(

#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
cur_q = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
} else
#endif

if (node->src[1]->type != vec_dot_type) {
size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
cur = MAX(cur, cur2);
size_t cur2_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
cur_q = MAX(cur_q, cur2_q);
}
} break;
case GGML_OP_MUL_MAT_ID:
@@ -14428,12 +14459,12 @@ struct ggml_cplan ggml_graph_plan(
const struct ggml_tensor * src1 = node->src[1];
const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
if (src1->type != vec_dot_type) {
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
}
const int n_as = src0->ne[2];
cur += GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
cur_q += GGML_PAD(cur, sizeof(int64_t)); // align
cur_q += n_as * sizeof(int64_t); // matrix_row_counts
cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
} break;
case GGML_OP_OUT_PROD:
{
@@ -14522,15 +14553,21 @@ struct ggml_cplan ggml_graph_plan(
}

work_size = MAX(work_size, cur);
q_size = MAX(q_size, cur_q);
}

if (work_size > 0) {
work_size += CACHE_LINE_SIZE*(n_threads);
}
if (q_size > 0) {
q_size += GGML_MAX_NAME;
}
work_size += q_size;

cplan.threadpool = threadpool;
cplan.n_threads = MIN(max_tasks, n_threads);
cplan.work_size = work_size;
cplan.q_size = q_size;
cplan.work_data = NULL;

return cplan;
Expand All @@ -14549,6 +14586,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/*.ith =*/ state->ith,
/*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
/*.wsize =*/ cplan->work_size,
/*.qsize =*/ cplan->q_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool=*/ tp,
};
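In ggml_graph_plan(), the quantized-activation requirements are now accumulated into a separate cur_q/q_size instead of the per-thread work estimate; at the end, GGML_MAX_NAME bytes are reserved for the name tag and the whole q_size is appended to work_size. A condensed sketch of that arithmetic, with illustrative parameter names (the row-bytes arguments stand in for ggml_row_size(vec_dot_type, ggml_nelements(src1)) of the corresponding nodes):

#include <stddef.h>
#include <stdint.h>

#define GGML_MAX_NAME 64
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

// Condensed sketch of the new sizing: per-node quantization needs go into
// q_size, which is appended to work_size once, together with the name tag.
size_t plan_sizes(size_t mul_mat_row_bytes, size_t mul_mat_id_row_bytes,
                  int n_as, int64_t ne12, size_t * q_size_out) {
    size_t work_size = 0;   // other per-op work estimates omitted here
    size_t q_size    = 0;

    // GGML_OP_MUL_MAT: quantized src1 rows only
    q_size = mul_mat_row_bytes > q_size ? mul_mat_row_bytes : q_size;

    // GGML_OP_MUL_MAT_ID: quantized src1 rows plus per-expert bookkeeping
    size_t cur_q = mul_mat_id_row_bytes;
    cur_q += GGML_PAD(cur_q, sizeof(int64_t));                 // align
    cur_q += (size_t) n_as * sizeof(int64_t);                  // matrix_row_counts
    cur_q += (size_t) n_as * (size_t) ne12 * sizeof(int64_t);  // matrix_rows
    q_size = cur_q > q_size ? cur_q : q_size;

    if (q_size > 0) {
        q_size += GGML_MAX_NAME;   // room for the src1 name tag
    }
    *q_size_out = q_size;
    return work_size + q_size;     // callers allocate the combined size
}

The MUL_MAT_ID compute path lays its region out in the same order as shown in the hunks above: name tag, quantized src1 rows, then matrix_row_counts and the matrix_rows mappings.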
