From a2beaffec8fbf95053c979e94739fc8a7b62a6e7 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Fri, 26 Apr 2024 06:16:56 -0700
Subject: [PATCH 1/3] Reset schedule earlier to allow overlap with graph computation on device

Refs #6763
---
 ggml-backend.c | 13 ++++++++-----
 llama.cpp      |  5 +++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 402d86ef3ac8b..03a0f17575246 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1780,12 +1780,15 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if(!sched->is_reset)
+    {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }
 
diff --git a/llama.cpp b/llama.cpp
index fa7c022f29130..86615a3f1a5a4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -16773,6 +16773,11 @@ float * llama_get_logits(struct llama_context * ctx) {
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     int32_t j = -1;
+
+    // Reset state for the next run before the following backend sync,
+    // to allow the CPU activities in the reset to overlap with device computation.
+    ggml_backend_sched_reset(ctx->sched);
+
     llama_synchronize(ctx);
 
     try {

From 34847caa9a241f07d6e4332b55231f0404de1176 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Fri, 26 Apr 2024 10:24:02 -0700
Subject: [PATCH 2/3] moved reset to end of llama_decode_internal

---
 llama.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 86615a3f1a5a4..b4da31e754b1c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11205,6 +11205,10 @@ static int llama_decode_internal(
         }
     }
 
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
     return 0;
 }
 
@@ -16773,11 +16777,6 @@ float * llama_get_logits(struct llama_context * ctx) {
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     int32_t j = -1;
-
-    // Reset state for the next run before the following backend sync,
-    // to allow the CPU activities in the reset to overlap with device computation.
-    ggml_backend_sched_reset(ctx->sched);
-
     llama_synchronize(ctx);
 
     try {

From 728562bc129ff0ba59ee9e43b4a13288737d76a6 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 26 Apr 2024 20:08:05 +0200
Subject: [PATCH 3/3] style fix

---
 ggml-backend.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 03a0f17575246..f46ae7b24bfc9 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1780,8 +1780,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-    if(!sched->is_reset)
-    {
+    if (!sched->is_reset) {
         size_t hash_size = sched->hash_set.size;
         memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
         memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
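
For context (a sketch, not part of the patches): ggml_backend_sched_reset is pure CPU work, a few memsets over the scheduler's hash tables, so issuing it before the backend sync lets that work overlap with the graph computation still running on the device. Below is a minimal stand-alone C illustration of the before/after ordering the series achieves; device_compute_async, device_sync, and sched_state are hypothetical placeholders for illustration, not ggml APIs.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the device graph launch and the backend sync;
   placeholders for illustration only, not ggml functions. */
static void device_compute_async(void) { /* launch device kernels; returns immediately */ }
static void device_sync(void)          { /* block until device work completes */ }

struct sched_state {
    char keys[1024];
    int  ids[256];
};

/* Before the series: the reset's memsets run after the sync, serialized
   behind the device computation. */
static void step_before(struct sched_state * s) {
    device_compute_async();
    device_sync();
    memset(s->keys,  0, sizeof(s->keys));
    memset(s->ids,  -1, sizeof(s->ids));
}

/* After the series: the reset runs between launch and sync, so the CPU
   memsets overlap with the device computation. */
static void step_after(struct sched_state * s) {
    device_compute_async();
    memset(s->keys,  0, sizeof(s->keys));
    memset(s->ids,  -1, sizeof(s->ids));
    device_sync();
}

int main(void) {
    struct sched_state s;
    step_before(&s);
    step_after(&s);
    puts("ok");
    return 0;
}

Patch 2 applies the same reasoning one level up: doing the reset at the end of llama_decode_internal, before any later synchronization point, rather than inside llama_get_logits_ith after the sync.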