From 4d0383321184aadf91968d9e3c6a45286ed2473b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= <yusufsarigoz@gmail.com>
Date: Sat, 7 Oct 2023 22:14:10 +0300
Subject: [PATCH 01/26] gguf.py : fix CI for publishing GGUF package (#3532)

* Fix CI for publishing GGUF package

* Bump version

* fix

* bump version

* bump version

* bump version
---
 .github/workflows/gguf-publish.yml | 3 ++-
 gguf-py/README.md                  | 1 -
 gguf-py/pyproject.toml             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml
index e61bfc6c32b60..57db175122c03 100644
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -36,8 +36,9 @@ jobs:
         poetry install
 
     - name: Build package
-      run: poetry build
+      run: cd gguf-py && poetry build
     - name: Publish package
       uses: pypa/gh-action-pypi-publish@release/v1
       with:
         password: ${{ secrets.PYPI_API_TOKEN }}
+        packages-dir: gguf-py/dist
diff --git a/gguf-py/README.md b/gguf-py/README.md
index ffe25c495ab1e..a28d8c57adc7d 100644
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -69,4 +69,3 @@ python -m twine upload dist/*
 ## TODO
 - [ ] Add tests
 - [ ] Include conversion scripts as command line entry points in this package.
-- Add CI workflow for releasing the package.
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 400607ce1dc30..07a7ab4dd11fc 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.0"
+version = "0.4.4"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [

From a16e89cec83b4bd5f6af8f1ce1400f94c12356f9 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sat, 7 Oct 2023 15:31:41 -0600
Subject: [PATCH 02/26] Fix trying to strip newline from empty prompt and cfg
 prompt file content (#3534)

---
 common/common.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 60b00b5fbb8f1..0f55c33a713a7 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -170,7 +170,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             // store the external file name in params
             params.prompt_file = argv[i];
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
         } else if (arg == "-n" || arg == "--n-predict") {
@@ -295,7 +295,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-            if (params.cfg_negative_prompt.back() == '\n') {
+            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
                 params.cfg_negative_prompt.pop_back();
             }
         } else if (arg == "--cfg-scale") {

From 63d3b06a4318329f92b078e8aa0be7ab6e9f871f Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sat, 7 Oct 2023 23:22:17 -0600
Subject: [PATCH 03/26] llama : fix missing break in Persimmon arch case
 statements (#3535)

---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d10656bb801db..a4312ab72d613 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2047,7 +2047,7 @@ static void llm_load_hparams(
                 case 36: model.type = e_model::MODEL_8B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
-        }
+        } break;
         case LLM_ARCH_REFACT:
             {
                 GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -4926,7 +4926,7 @@ static struct ggml_cgraph * llama_build_graph(
         case LLM_ARCH_PERSIMMON:
             {
                 result = llm_build_persimmon(lctx, batch);
-            }
+            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm_build_refact(lctx, batch);

From b0ec5218c3d24755786b80ecce9cf4ffc07583f8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 8 Oct 2023 10:01:53 +0300
Subject: [PATCH 04/26] metal : support MTLGPUFamily < Apple7, formatting,
 style (#3524)

* metal : improve decoding speed for batches of 2-16

* metal : rename kernels mul_mat_ to mul_mv_

* metal : indentations

* minor

* metal : print more GPU info + disable mul_mm for MTLGPUFamiliy < Apple7
---
 ggml-metal.m     | 202 +++++++++++++++++++++++++++++------------------
 ggml-metal.metal |  92 ++++++++++++---------
 2 files changed, 176 insertions(+), 118 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index f8fa05dd9f4a7..57c238ddaaf67 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -81,18 +81,18 @@
     GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
-    GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
-    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@@ -262,28 +262,30 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
         GGML_METAL_ADD_KERNEL(get_rows_q6_K);
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
-        GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
-        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
+        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
+        GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
+        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
+            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
+        }
         GGML_METAL_ADD_KERNEL(rope_f32);
         GGML_METAL_ADD_KERNEL(rope_f16);
         GGML_METAL_ADD_KERNEL(alibi_f32);
@@ -296,8 +298,22 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
 #undef GGML_METAL_ADD_KERNEL
     }
 
-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 #if TARGET_OS_OSX
+    // print MTL GPU family:
+    GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
+    GGML_METAL_LOG_INFO("%s: GPU arch:   %s\n", __func__, [[ctx->device architecture].name UTF8String]);
+
+    // determine max supported GPU family
+    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+    for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) {
+        if ([ctx->device supportsFamily:i]) {
+            GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
+            break;
+        }
+    }
+
+    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
         GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
@@ -339,28 +355,30 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(get_rows_q6_K);
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
-    GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
-    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
-    GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
+    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
+    GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
+    if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
+        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+    }
     GGML_METAL_DEL_KERNEL(rope_f32);
     GGML_METAL_DEL_KERNEL(rope_f16);
     GGML_METAL_DEL_KERNEL(alibi_f32);
@@ -986,21 +1004,46 @@ void ggml_metal_graph_compute(
                         } break;
                     case GGML_OP_MUL_MAT:
                         {
-                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
-
                             GGML_ASSERT(ne00 == ne10);
-                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
-                            uint gqa = ne12/ne02;
                             GGML_ASSERT(ne03 == ne13);
 
+                            const uint gqa = ne12/ne02;
+
+                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                            // to the matrix-vector kernel
+                            int ne11_mm_min = 1;
+
+#if 0
+                            // the numbers below are measured on M2 Ultra for 7B and 13B models
+                            // these numbers do not translate to other devices or model sizes
+                            // TODO: need to find a better approach
+                            if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+                                switch (src0t) {
+                                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+                                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+                                    case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+                                    case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+                                    case GGML_TYPE_Q4_0:
+                                    case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+                                    case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+                                    case GGML_TYPE_Q5_0:                          // not tested yet
+                                    case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+                                    case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+                                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+                                    default:             ne11_mm_min = 1;  break;
+                                }
+                            }
+#endif
+
                             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if (!ggml_is_transposed(src0) &&
+                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                                !ggml_is_transposed(src0) &&
                                 !ggml_is_transposed(src1) &&
                                 src1t == GGML_TYPE_F32 &&
-                                [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                ne00%32 == 0 &&
-                                ne11 > 2) {
+                                ne00 % 32 == 0 &&
+                                ne11 > ne11_mm_min) {
+                                //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                                 switch (src0->type) {
                                     case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32];  break;
                                     case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
@@ -1029,17 +1072,18 @@ void ggml_metal_graph_compute(
                                 [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
                                 [encoder setBytes:&gqa     length:sizeof(gqa)  atIndex:13];
                                 [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                             } else {
                                 int nth0 = 32;
                                 int nth1 = 1;
                                 int nrows = 1;
+                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
                                 // use custom matrix x vector kernel
                                 switch (src0t) {
                                     case GGML_TYPE_F32:
                                         {
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
                                             nrows = 4;
                                         } break;
                                     case GGML_TYPE_F16:
@@ -1047,12 +1091,12 @@ void ggml_metal_graph_compute(
                                             nth0 = 32;
                                             nth1 = 1;
                                             if (ne11 * ne12 < 4) {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
                                             } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
+                                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
                                                 nrows = ne11;
                                             } else {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
                                                 nrows = 4;
                                             }
                                         } break;
@@ -1063,7 +1107,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
                                         } break;
                                     case GGML_TYPE_Q4_1:
                                         {
@@ -1072,7 +1116,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
                                         } break;
                                     case GGML_TYPE_Q8_0:
                                         {
@@ -1081,7 +1125,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 8;
                                             nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
                                         } break;
                                     case GGML_TYPE_Q2_K:
                                         {
@@ -1090,7 +1134,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
                                         } break;
                                     case GGML_TYPE_Q3_K:
                                         {
@@ -1099,7 +1143,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
                                         } break;
                                     case GGML_TYPE_Q4_K:
                                         {
@@ -1108,7 +1152,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 4; //1;
                                             nth1 = 8; //32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
                                         } break;
                                     case GGML_TYPE_Q5_K:
                                         {
@@ -1117,7 +1161,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
                                         } break;
                                     case GGML_TYPE_Q6_K:
                                         {
@@ -1126,7 +1170,7 @@ void ggml_metal_graph_compute(
 
                                             nth0 = 2;
                                             nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
                                         } break;
                                     default:
                                         {
@@ -1155,7 +1199,7 @@ void ggml_metal_graph_compute(
                                 [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];
 
                                 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                                    src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
+                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
                                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                 }
                                 else if (src0t == GGML_TYPE_Q4_K) {
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 9bd94e82bfe1e..b6288db28660d 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -13,8 +13,8 @@ typedef struct {
 
 #define QK4_1 32
 typedef struct {
-    half d;          // delta
-    half m;          // min
+    half d;                 // delta
+    half m;                 // min
     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
 
@@ -423,8 +423,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
 }
 
 // putting them in the kernel cause a significant performance penalty
-#define N_DST 4 // each SIMD group works on 4 rows
-#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_DST 4        // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2  // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 //      quantizations where the block size is 32. It also does not
@@ -435,18 +435,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
                     int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
                     uint3 tgpig, uint tiisg, uint sgitg) {
     const int nb = ne00/QK4_0;
+
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
+
     const int first_row = (r0 * nsg + sgitg) * nr;
+
     const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
+
     device const block_q_type * x = (device const block_q_type *) src0 + offset0;
     device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
-    float yl[16];       // src1 vector cache
-    float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = 8*(tiisg%2);
+    float yl[16]; // src1 vector cache
+    float sumf[nr] = {0.f};
+
+    const int ix = (tiisg/2);
+    const int il = (tiisg%2)*8;
 
     device const float * yb = y + ix * QK4_0 + il;
 
@@ -457,6 +462,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
             sumy += yb[i] + yb[i+1];
             yl[i+0] = yb[i+ 0];
             yl[i+1] = yb[i+ 1]/256.f;
+
             sumy += yb[i+16] + yb[i+17];
             yl[i+8] = yb[i+16]/16.f;
             yl[i+9] = yb[i+17]/4096.f;
@@ -472,12 +478,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
     for (int row = 0; row < nr; ++row) {
         const float tot = simd_sum(sumf[row]);
         if (tiisg == 0 && first_row + row < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
         }
     }
 }
 
-kernel void kernel_mul_mat_q4_0_f32(
+kernel void kernel_mul_mv_q4_0_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -490,12 +496,12 @@ kernel void kernel_mul_mat_q4_0_f32(
         constant   int64_t & ne1[[buffer(16)]],
         constant   uint    & gqa[[buffer(17)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
     mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
-kernel void kernel_mul_mat_q4_1_f32(
+kernel void kernel_mul_mv_q4_1_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -515,7 +521,7 @@ kernel void kernel_mul_mat_q4_1_f32(
 
 #define NB_Q8_0 8
 
-kernel void kernel_mul_mat_q8_0_f32(
+kernel void kernel_mul_mv_q8_0_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -579,7 +585,7 @@ kernel void kernel_mul_mat_q8_0_f32(
 
 #define N_F32_F32 4
 
-kernel void kernel_mul_mat_f32_f32(
+kernel void kernel_mul_mv_f32_f32(
         device const  char * src0,
         device const  char * src1,
         device       float * dst,
@@ -650,7 +656,7 @@ kernel void kernel_mul_mat_f32_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32_1row(
+kernel void kernel_mul_mv_f16_f32_1row(
         device const  char * src0,
         device const  char * src1,
         device       float * dst,
@@ -669,7 +675,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
         constant   int64_t & ne0,
         constant   int64_t & ne1,
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
+        uint  tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -704,7 +710,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
 
 #define N_F16_F32 4
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mv_f16_f32(
         device const  char * src0,
         device const  char * src1,
         device       float * dst,
@@ -776,7 +782,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 
 // Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mat_f16_f32_l4(
+kernel void kernel_mul_mv_f16_f32_l4(
         device const  char * src0,
         device const  char * src1,
         device       float * dst,
@@ -1253,7 +1259,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
 
 //====================================== dot products =========================
 
-kernel void kernel_mul_mat_q2_K_f32(
+kernel void kernel_mul_mv_q2_K_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -1397,7 +1403,7 @@ kernel void kernel_mul_mat_q2_K_f32(
 }
 
 #if QK_K == 256
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -1549,7 +1555,7 @@ kernel void kernel_mul_mat_q3_K_f32(
     }
 }
 #else
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -1620,7 +1626,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 #endif
 
 #if QK_K == 256
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -1726,7 +1732,7 @@ kernel void kernel_mul_mat_q4_K_f32(
     }
 }
 #else
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -1815,7 +1821,7 @@ kernel void kernel_mul_mat_q4_K_f32(
 }
 #endif
 
-kernel void kernel_mul_mat_q5_K_f32(
+kernel void kernel_mul_mv_q5_K_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -1988,7 +1994,7 @@ kernel void kernel_mul_mat_q5_K_f32(
 
 }
 
-kernel void kernel_mul_mat_q6_K_f32(
+kernel void kernel_mul_mv_q6_K_f32(
         device const  void * src0,
         device const float * src1,
         device       float * dst,
@@ -2326,7 +2332,7 @@ kernel void kernel_get_rows(
 }
 
 #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
 #define BLOCK_SIZE_K 32
 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
 #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@@ -2363,9 +2369,11 @@ kernel void kernel_mul_mm(device const  uchar * src0,
     const uint r0 = tgpig.y;
     const uint r1 = tgpig.x;
     const uint im = tgpig.z;
+
     // if this block is of 64x32 shape or smaller
     short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
     short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
     // a thread shouldn't load data outside of the matrix
     short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
     short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
@@ -2389,26 +2397,30 @@ kernel void kernel_mul_mm(device const  uchar * src0,
         + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
     for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        //load data and store to threadgroup memory
+        // load data and store to threadgroup memory
         half4x4 temp_a;
         dequantize_func(x, il, temp_a);
         threadgroup_barrier(mem_flags::mem_threadgroup);
+
         #pragma unroll(16)
         for (int i = 0; i < 16; i++) {
             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
-            + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
+            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
         }
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
-                = *((device float2x4 *)y);
+
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
         il = (il + 2 < nl) ? il + 2 : il % 2;
         x  = (il < 2) ? x + (2+nl-1)/nl : x;
         y += BLOCK_SIZE_K;
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        //load matrices from threadgroup memory and conduct outer products
+
+        // load matrices from threadgroup memory and conduct outer products
         threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
         threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
         #pragma unroll(4)
         for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
             #pragma unroll(4)
@@ -2423,6 +2435,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
 
             lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
             lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
             #pragma unroll(8)
             for (int i = 0; i < 8; i++){
                 simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
@@ -2431,25 +2444,26 @@ kernel void kernel_mul_mm(device const  uchar * src0,
     }
 
     if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-        device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
-                          + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
+        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
+                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
         for (int i = 0; i < 8; i++) {
             simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
         }
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
         for (int i = 0; i < 8; i++) {
             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
         }
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-        if (sgitg==0) {
+
+        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+        if (sgitg == 0) {
             for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
+                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
                     *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
                 }
             }

From 7d8b24932fe788a4cda76459a0c5df3e0073cb98 Mon Sep 17 00:00:00 2001
From: Luo Tian <lt@basecity.com>
Date: Sun, 8 Oct 2023 16:24:01 +0800
Subject: [PATCH 05/26] zig : fix build by introducing train.cpp (#3539)

---
 build.zig | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/build.zig b/build.zig
index 3a8978bc37fe2..b95491e03f12f 100644
--- a/build.zig
+++ b/build.zig
@@ -111,12 +111,14 @@ pub fn build(b: *std.build.Builder) !void {
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("common", "common/console.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+    const train = make.obj("train", "common/train.cpp");
 
     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train });
 
     const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
     if (server.target.isWindows()) {

From 94e502dfb79430870b42b8e8ee132b4aaa93e4a8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 8 Oct 2023 11:24:50 +0300
Subject: [PATCH 06/26] ci : enable on obj-c changes + fix metal build (#3540)

---
 .github/workflows/build.yml | 4 ++--
 ggml-metal.m                | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c1e36ee28d202..e41be76db0820 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
diff --git a/ggml-metal.m b/ggml-metal.m
index 57c238ddaaf67..92956ed97e9c9 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -301,12 +301,11 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
 #if TARGET_OS_OSX
     // print MTL GPU family:
     GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
-    GGML_METAL_LOG_INFO("%s: GPU arch:   %s\n", __func__, [[ctx->device architecture].name UTF8String]);
 
     // determine max supported GPU family
     // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
     // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-    for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) {
+    for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
         if ([ctx->device supportsFamily:i]) {
             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
             break;

From a1202a31ed8c35705bd09fe91c3e7410c619bd70 Mon Sep 17 00:00:00 2001
From: Johannes Rudolph <johannes.rudolph@gmail.com>
Date: Sun, 8 Oct 2023 12:21:19 +0200
Subject: [PATCH 07/26] k-quants : fix comments about block sizing (#3499)

---
 k_quants.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/k_quants.h b/k_quants.h
index adc6a391376d4..9de089e7a4719 100644
--- a/k_quants.h
+++ b/k_quants.h
@@ -29,7 +29,7 @@
 
 // 2-bit quantization
 // weight is represented as x = a * q + b
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 2.5625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 
 // 3-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
 #ifdef GGML_QKK_64
 typedef struct {
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
 #endif
 
 // 4-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 #ifdef GGML_QKK_64
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 #endif
 
 // 5-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 #ifdef GGML_QKK_64
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 
 // 6-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 6.5625 bits per weight
 typedef struct {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits

From 9c38d181d40b9d27f8f42152c18e7c70bfffcf37 Mon Sep 17 00:00:00 2001
From: arcrank <arcrank@gmail.com>
Date: Sun, 8 Oct 2023 06:52:57 -0400
Subject: [PATCH 08/26] api_like_OAI.py : simplify function (#2796)

Simplify function
---
 examples/server/api_like_OAI.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
index ed19237b0b3e5..1b0bf575706ea 100755
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -23,13 +23,7 @@
 args = parser.parse_args()
 
 def is_present(json, key):
-    try:
-        buf = json[key]
-    except KeyError:
-        return False
-    return True
-
-
+    return key in json
 
 #convert chat to prompt
 def convert_chat(messages):

From 8e6716a102e390e930594d51302730184dac83cc Mon Sep 17 00:00:00 2001
From: Ryder Wishart <ryderwishart@gmail.com>
Date: Sun, 8 Oct 2023 03:55:58 -0700
Subject: [PATCH 09/26] api_like_OAI.py : compat with Microsoft Guidance
 (#2746)

Check for None in addition to empty string check in all request params

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/server/api_like_OAI.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
index 1b0bf575706ea..14d2dcf6580e4 100755
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -23,7 +23,13 @@
 args = parser.parse_args()
 
 def is_present(json, key):
-    return key in json
+    try:
+        buf = json[key]
+    except KeyError:
+        return False
+    if json[key] == None:
+        return False
+    return True
 
 #convert chat to prompt
 def convert_chat(messages):

From eee42c670e6fa6df9cf17e7ffc319f74cbd81354 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matheus=20C=2E=20Fran=C3=A7a?=
 <matheus-catarino@hotmail.com>
Date: Sun, 8 Oct 2023 10:59:20 -0300
Subject: [PATCH 10/26] ci : add Zig CI/CD and fix build (#2996)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* zig CI/CD and fix build

Signed-off-by: Matheus Catarino França <matheus-catarino@hotmail.com>

* fix build_compiler

* ci : remove trailing whitespace

---------

Signed-off-by: Matheus Catarino França <matheus-catarino@hotmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 .github/workflows/zig-build.yml | 25 +++++++++++++++++++++++++
 build.zig                       | 31 ++++++++++++++++++++++++-------
 2 files changed, 49 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/zig-build.yml

diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml
new file mode 100644
index 0000000000000..68a698ab94f55
--- /dev/null
+++ b/.github/workflows/zig-build.yml
@@ -0,0 +1,25 @@
+name: Zig CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.runs-on }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+          fetch-depth: 0
+      - uses: goto-bus-stop/setup-zig@v2
+        with:
+          version: 0.11.0
+      - name: Build Summary
+        run: zig build --summary all -freference-trace
diff --git a/build.zig b/build.zig
index b95491e03f12f..c86e4c667cf30 100644
--- a/build.zig
+++ b/build.zig
@@ -36,14 +36,17 @@ const Maker = struct {
     }
 
     fn init(builder: *std.build.Builder) !Maker {
-        // const commit_hash = @embedFile(".git/refs/heads/master");
         const target = builder.standardTargetOptions(.{});
+        const zig_version = @import("builtin").zig_version_string;
+        const commit_hash = try std.ChildProcess.exec(
+            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
+        );
         const config_header = builder.addConfigHeader(
             .{ .style = .blank, .include_path = "build-info.h" },
             .{
                 .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = "12345", // omit newline
-                .BUILD_COMPILER = "Zig 0.11.0",
+                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
+                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
                 .BUILD_TARGET = try target.allocDescription(builder.allocator),
             },
         );
@@ -67,12 +70,20 @@ const Maker = struct {
 
     fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
         const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (o.target.getAbi() != .msvc)
+            o.defineCMacro("_GNU_SOURCE", null);
+        o.addConfigHeader(m.config_header);
         if (std.mem.endsWith(u8, src, ".c")) {
             o.addCSourceFiles(&.{src}, m.cflags.items);
             o.linkLibC();
         } else {
             o.addCSourceFiles(&.{src}, m.cxxflags.items);
-            o.linkLibCpp();
+            if (o.target.getAbi() == .msvc) {
+                o.linkLibC(); // need winsdk + crt
+            } else {
+                // linkLibCpp already add (libc++ + libunwind + libc)
+                o.linkLibCpp();
+            }
         }
         o.addConfigHeader(m.config_header);
         for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
@@ -86,8 +97,14 @@ const Maker = struct {
         for (deps) |d| e.addObject(d);
         for (m.objs.items) |o| e.addObject(o);
         for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-        e.linkLibC();
-        e.linkLibCpp();
+
+        // https://github.com/ziglang/zig/issues/15448
+        if (e.target.getAbi() == .msvc) {
+            e.linkLibC(); // need winsdk + crt
+        } else {
+            // linkLibCpp already add (libc++ + libunwind + libc)
+            e.linkLibCpp();
+        }
         e.addConfigHeader(m.config_header);
         m.builder.installArtifact(e);
         e.want_lto = m.enable_lto;
@@ -109,7 +126,7 @@ pub fn build(b: *std.build.Builder) !void {
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
-    const console = make.obj("common", "common/console.cpp");
+    const console = make.obj("console", "common/console.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
 

From db3abcc114d5d1790ba814aa1a80ac673d4ccc3e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 8 Oct 2023 20:19:14 +0300
Subject: [PATCH 11/26] sync : ggml (ggml-backend) (#3548)

* sync : ggml (ggml-backend)

ggml-ci

* zig : add ggml-backend to the build
---
 CMakeLists.txt       |   2 +
 Makefile             |   7 +-
 build.zig            |  15 +-
 ggml-alloc.c         | 169 +++++---------
 ggml-alloc.h         |  16 +-
 ggml-backend.c       | 385 +++++++++++++++++++++++++++++++
 ggml-backend.h       | 143 ++++++++++++
 ggml-cuda.cu         | 535 ++++++++++++++++++++++++++++++++++++-------
 ggml-cuda.h          |   4 +
 ggml-metal.h         |  19 +-
 ggml-metal.m         | 137 +++++++++++
 ggml.c               |  37 +--
 ggml.h               |  16 +-
 llama.cpp            |  44 ++--
 scripts/sync-ggml.sh |  24 +-
 15 files changed, 1285 insertions(+), 268 deletions(-)
 create mode 100644 ggml-backend.c
 create mode 100644 ggml-backend.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c79ec48609b0..9184eda8f2988 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -663,6 +663,8 @@ add_library(ggml OBJECT
             ggml.h
             ggml-alloc.c
             ggml-alloc.h
+            ggml-backend.c
+            ggml-backend.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
diff --git a/Makefile b/Makefile
index b8b0d4b562512..40187c4a25e62 100644
--- a/Makefile
+++ b/Makefile
@@ -512,9 +512,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-OBJS += ggml-alloc.o
+ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: common/common.cpp common/common.h build-info.h common/log.h
diff --git a/build.zig b/build.zig
index c86e4c667cf30..fdc5bc084eb46 100644
--- a/build.zig
+++ b/build.zig
@@ -124,20 +124,21 @@ pub fn build(b: *std.build.Builder) !void {
 
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 805759db74fef..3321f05e2ef94 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -1,4 +1,5 @@
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
@@ -6,25 +7,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/types.h>
-            #include <sys/mman.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #include <memoryapi.h>
-#endif
-
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
+    struct ggml_backend_buffer * buffer;
+    bool buffer_owned;
     void * data;
-    size_t size;
     size_t alignment;
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    return ggml_nbytes(tensor);
-
-    UNUSED(alloc);
-}
-
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+    return tensor->buffer == alloc->buffer;
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-#ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-#endif
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     tensor->data = addr;
     AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+    tensor->buffer = alloc->buffer;
+    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-
     if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
+        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    void * ptr = tensor->data;
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
+    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->size - align_offset;
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ data,
-        /*.size          = */ size,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ true,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
-    if (ptr == MAP_FAILED) {
-        return NULL;
-    }
-    return ptr;
-#else
-    // use a fixed address for other platforms
-    uintptr_t base_addr = (uintptr_t)-size - 0x100;
-    return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-    VirtualFree(base_addr, 0, MEM_RELEASE);
-    UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-    munmap(base_addr, size);
-#else
-    // nothing to do
-    UNUSED(base_addr);
-    UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    // 128GB for 64-bit, 1GB for 32-bit
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
-    do {
-        *base_addr = alloc_vmem(*size);
-        if (*base_addr != NULL) {
-            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-            return;
-        }
-        // try again with half the size
-        *size /= 2;
-    } while (*size > 0);
-
-    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-    free_vmem(base_addr, size);
-}
-
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+    alloc->measure = true;
 
-    void * base_addr;
-    size_t size;
+    return alloc;
+}
 
-    alloc_measure_vmem(&base_addr, &size);
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ base_addr,
-        /*.size          = */ size,
-        /*.alignment     = */ alignment,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ false,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
+        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
-        /*.measure       = */ true,
+        /*.measure       = */ false,
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
-        free_measure_vmem(alloc->data, alloc->size);
+    if (alloc->buffer_owned) {
+        ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_CONT:
             return true;
 
         default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+    assert(view->view_src != NULL && view->view_src->data != NULL);
+    view->backend = view->view_src->backend;
+    view->buffer  = view->view_src->buffer;
+    view->data    = (char *)view->view_src->data + view->view_offs;
+
+    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+    assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend);
+    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            assert(node->view_src->data != NULL);
-            node->data = (char *)node->view_src->data + node->view_offs;
+            init_view(alloc, node);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                 // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                                 // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                                 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                                node->data = parent->data;
+                                node->view_src = view_src;
+                                view_src_hn->n_views += 1;
+                                init_view(alloc, node);
                                 return;
                             }
                         }
                         else {
                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                            node->data = parent->data;
+                            node->view_src = parent;
+                            p_hn->n_views += 1;
+                            init_view(alloc, node);
                             return;
                         }
                     }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
             if (ggml_is_view(node)) {
                 struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
+                if (node->buffer == NULL && node->data != NULL) {
+                    // view of a pre-allocated tensor, didn't call init_view() yet
+                    init_view(alloc, node);
+                }
             }
 
             for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                     break;
                 }
                 hash_get(ht, parent)->n_children += 1;
+                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                    init_view(alloc, parent);
+                }
             }
         }
     }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
diff --git a/ggml-alloc.h b/ggml-alloc.h
index 0c224f174f396..e38758878b91a 100644
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -6,21 +6,27 @@
 extern "C" {
 #endif
 
+struct ggml_backend_buffer;
 
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
 GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
-GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
-GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
-GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
-GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API void   ggml_allocr_free       (struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset      (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc      (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
-GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+GGML_API size_t ggml_allocr_max_size   (struct ggml_allocr * alloc);
 
+GGML_API size_t ggml_allocr_alloc_graph_n(
+                    struct ggml_allocr * alloc,
+                    struct ggml_cgraph ** graphs, int n_graphs,
+                    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml-backend.c b/ggml-backend.c
new file mode 100644
index 0000000000000..ca8d83dafe47c
--- /dev/null
+++ b/ggml-backend.c
@@ -0,0 +1,385 @@
+#include "ggml-backend.h"
+#include "ggml-alloc.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED GGML_UNUSED
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// backend buffer
+
+ggml_backend_buffer_t ggml_backend_buffer_init(
+        struct ggml_backend                  * backend,
+        struct ggml_backend_buffer_i           iface,
+               ggml_backend_buffer_context_t   context,
+               size_t                          size) {
+    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+
+    GGML_ASSERT(iface.get_base != NULL);
+
+    (*buffer) = (struct ggml_backend_buffer) {
+        /* .interface = */ iface,
+        /* .backend   = */ backend,
+        /* .context   = */ context,
+        /* .size      = */ size,
+    };
+
+    return buffer;
+}
+
+void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer->iface.free_buffer != NULL) {
+        buffer->iface.free_buffer(buffer);
+    }
+    free(buffer);
+}
+
+size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
+    return ggml_backend_get_alignment(buffer->backend);
+}
+
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_base(buffer);
+}
+
+size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
+    return buffer->size;
+}
+
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.get_alloc_size) {
+        return buffer->iface.get_alloc_size(buffer, tensor);
+    }
+    return ggml_nbytes(tensor);
+}
+
+void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    }
+}
+
+void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.free_tensor) {
+        buffer->iface.free_tensor(buffer, tensor);
+    }
+}
+
+// backend
+
+ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
+    return tensor->buffer->backend;
+}
+
+const char * ggml_backend_name(ggml_backend_t backend) {
+    return backend->iface.get_name(backend);
+}
+
+void ggml_backend_free(ggml_backend_t backend) {
+    backend->iface.free(backend);
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
+    return backend->iface.alloc_buffer(backend, size);
+}
+
+size_t ggml_backend_get_alignment(ggml_backend_t backend) {
+    return backend->iface.get_alignment(backend);
+}
+
+void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_synchronize(ggml_backend_t backend) {
+    backend->iface.synchronize(backend);
+}
+
+ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_plan_create(backend, cgraph);
+}
+
+void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    backend->iface.graph_plan_free(backend, plan);
+}
+
+void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    backend->iface.graph_plan_compute(backend, plan);
+}
+
+void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    backend->iface.graph_compute(backend, cgraph);
+}
+
+bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return backend->iface.supports_op(backend, op);
+}
+
+// backend copy
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
+    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
+    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
+
+    if (src == dst) {
+        return;
+    }
+
+    // TODO: allow backends to support copy to/from same backend
+
+    if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
+        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
+    } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
+        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
+    } else {
+        // shouldn't be hit when copying from/to CPU
+        #ifndef NDEBUG
+        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
+        #endif
+        size_t nbytes = ggml_nbytes(src);
+        void * data = malloc(nbytes);
+        ggml_backend_tensor_get(src, data, 0, nbytes);
+        ggml_backend_tensor_set(dst, data, 0, nbytes);
+        free(data);
+    }
+}
+
+// backend CPU
+
+struct ggml_backend_cpu_context {
+    int n_threads;
+    void * work_data;
+    size_t work_size;
+};
+
+static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+    return "CPU";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    free(cpu_ctx->work_data);
+    free(cpu_ctx);
+    free(backend);
+}
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)buffer->context;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+    /* .free_buffer    = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL, // no initialization required
+    /* .free_tensor    = */ NULL, // no cleanup required
+};
+
+// for buffers from ptr, free is not called
+static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+    /* .free_buffer    = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL,
+    /* .free_tensor    = */ NULL,
+};
+
+static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
+
+static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
+    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
+    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+
+    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
+    return TENSOR_ALIGNMENT;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
+    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cgraph = *cgraph;
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+    }
+
+    return cpu_plan;
+}
+
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    free(cpu_plan->cplan.work_data);
+    free(cpu_plan);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        // TODO: may be faster to free and use malloc to avoid the copy
+        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+        cpu_ctx->work_size = cplan.work_size;
+    }
+
+    cplan.work_data = cpu_ctx->work_data;
+
+    ggml_graph_compute(cgraph, &cplan);
+}
+
+static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return true;
+    UNUSED(backend);
+    UNUSED(op);
+}
+
+static struct ggml_backend_i cpu_backend_i = {
+    /* .get_name            = */ ggml_backend_cpu_name,
+    /* .free                = */ ggml_backend_cpu_free,
+    /* .alloc_buffer        = */ ggml_backend_cpu_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cpu_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cpu_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cpu_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cpu_synchronize,
+    /* .cpy_tensor_from     = */ ggml_backend_cpu_cpy_tensor_from,
+    /* .cpy_tensor_to       = */ ggml_backend_cpu_cpy_tensor_to,
+    /* .graph_plan_create   = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cpu_graph_compute,
+    /* .supports_op         = */ ggml_backend_cpu_supports_op,
+};
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+
+    ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->work_data = NULL;
+    ctx->work_size = 0;
+
+    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+
+    *cpu_backend = (struct ggml_backend) {
+        /* .interface = */ cpu_backend_i,
+        /* .context   = */ ctx
+    };
+    return cpu_backend;
+}
+
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cpu_name;
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
+    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
+}
diff --git a/ggml-backend.h b/ggml-backend.h
new file mode 100644
index 0000000000000..da134b0dbed51
--- /dev/null
+++ b/ggml-backend.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+    struct ggml_backend;
+    struct ggml_backend_buffer;
+
+    // type-erased backend-specific types / wrappers
+    typedef void * ggml_backend_context_t;
+    typedef void * ggml_backend_graph_plan_t;
+    typedef void * ggml_backend_buffer_context_t;
+
+    // avoid accessing internals of these types
+    typedef struct ggml_backend        * ggml_backend_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+
+    //
+    // backend buffer
+    //
+
+    struct ggml_backend_buffer_i {
+        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
+        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
+        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
+        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
+        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+    };
+
+    // TODO: hide behind API
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i iface;
+
+        ggml_backend_t                backend;
+        ggml_backend_buffer_context_t context;
+
+        size_t size;
+    };
+
+    // backend buffer functions
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+            struct ggml_backend                  * backend,
+            struct ggml_backend_buffer_i           iface,
+                   ggml_backend_buffer_context_t   context,
+                   size_t                          size);
+
+    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+    //
+    // backend
+    //
+
+    struct ggml_backend_i {
+        const char * (*get_name)(ggml_backend_t backend);
+
+        void (*free)(ggml_backend_t backend);
+
+        // buffer allocation
+        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+
+        // get buffer alignment
+        size_t (*get_alignment)(ggml_backend_t backend);
+
+        // tensor data access
+        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        void (*synchronize)     (ggml_backend_t backend);
+
+        // (optional) copy tensor between different backends, allow for single-copy tranfers
+        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        // compute graph with a plan
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph without a plan
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // check if the backend supports an operation
+        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+    };
+
+    // TODO: hide behind API
+    struct ggml_backend {
+        struct ggml_backend_i iface;
+
+        ggml_backend_context_t context;
+    };
+
+    // backend helper functions
+    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
+
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+
+    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    //
+    // CPU backend
+    //
+
+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 989c419cd0ea4..7e92c519741b9 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -419,6 +420,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -1574,6 +1576,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int r = y[row];
+
+    // copy x[r*ncols + col] to dst[row*ncols + col]
+    const int xi = r*ncols + col;
+    const int di = row*ncols + col;
+
+    const int ib = xi/qk; // block index
+    const int iqs = (xi%qk)/qr; // quant index
+    const int iybs = di - di%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(x, ib, iqs, v);
+
+    dst[iybs + iqs + 0]        = v.x;
+    dst[iybs + iqs + y_offset] = v.y;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4555,6 +4585,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, nrows, 1);
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5703,7 +5742,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
-        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -5739,6 +5778,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }
 
+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for                         (int i3 = 0; i3 < nr3;  i3++) {
+        for                     (int k3 = 0; k3 < ne03; k3++) {
+            for                 (int i2 = 0; i2 < nr2;  i2++) {
+                for             (int k2 = 0; k2 < ne02; k2++) {
+                    for         (int i1 = 0; i1 < nr1;  i1++) {
+                        for     (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0;  i0++) {
+                                CUDA_CHECK(cudaMemcpyAsync(
+                                              (char *)  dst_d + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0,
+                                        (const char *) src0_d + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01,
+                                        ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    (void) src1;
+    (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int ncols = src0->ne[0];
+    const int nrows = ggml_nelements(src1);
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6343,7 +6483,14 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const float scale = ((float *) src1->data)[0];
+    float scale;
+    // HACK: support for ggml backend interface
+    if (src1->backend == GGML_BACKEND_CPU) {
+        scale = ((float *) src1->data)[0];
+    } else {
+        // TODO: pass pointer to kernel instead of copying to host
+        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+    }
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -6362,9 +6509,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
 
-    struct ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
+    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
 
     const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6505,9 +6652,9 @@ static void ggml_cuda_op_mul_mat(
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    struct ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6585,7 +6732,7 @@ static void ggml_cuda_op_mul_mat(
         if (convert_src1_to_q8_1) {
             src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
-            if (split && src1_on_device && src1_is_contiguous) {
+            if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6667,7 +6814,7 @@ static void ggml_cuda_op_mul_mat(
                     GGML_ASSERT(false);
                 }
 
-                if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
                     quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                     CUDA_CHECK(cudaGetLastError());
                 }
@@ -6758,6 +6905,14 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
@@ -6812,13 +6967,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6843,13 +6998,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6870,11 +7025,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    }else if (src0->type == GGML_TYPE_F32) {
+    } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6935,8 +7090,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 
     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6991,8 +7146,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
     const size_t nb1 = tensor->nb[1];
 
-    ggml_backend backend = tensor->backend;
-    struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    ggml_backend_type backend = tensor->backend;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
     for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7046,7 +7201,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
         }
 
-
         CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
@@ -7085,17 +7239,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
 static size_t g_temp_tensor_extra_index = 0;
 
-static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
     return extra;
@@ -7123,7 +7277,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         return;
     }
 
-    struct ggml_tensor_extra_gpu * extra;
+    ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -7132,7 +7286,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
 
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7141,7 +7295,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
@@ -7183,13 +7337,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }
 
-    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW;
 
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7207,7 +7361,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(ggml_is_contiguous(tensor));
 
-    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }
@@ -7264,58 +7418,47 @@ void ggml_cuda_free_scratch() {
     g_scratch_buffer = nullptr;
 }
 
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
     switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
         case GGML_OP_DUP:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_dup;
             break;
         case GGML_OP_ADD:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_add;
             break;
         case GGML_OP_MUL:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_mul;
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                     func = ggml_cuda_gelu;
                     break;
                 case GGML_UNARY_OP_SILU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                     func = ggml_cuda_silu;
                     break;
                 default:
                     return false;
             } break;
         case GGML_OP_NORM:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_norm;
             break;
         case GGML_OP_RMS_NORM:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
@@ -7325,54 +7468,30 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_mul_mat;
             break;
         case GGML_OP_SCALE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_scale;
             break;
         case GGML_OP_CPY:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_cpy;
             break;
         case GGML_OP_CONT:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_dup;
             break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_nop;
             break;
         case GGML_OP_DIAG_MASK_INF:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_diag_mask_inf;
             break;
         case GGML_OP_SOFT_MAX:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_soft_max;
             break;
         case GGML_OP_ROPE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_rope;
             break;
         case GGML_OP_ALIBI:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_alibi;
             break;
         default:
@@ -7400,3 +7519,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_backend_context_cuda {
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    delete cuda_ctx;
+    delete backend;
+}
+
+struct ggml_backend_buffer_context_cuda {
+    void * device;
+
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFree(ctx->device));
+    delete ctx;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->device;
+}
+
+static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->backend == buffer->backend);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[g_main_device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+        }
+    }
+
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .free_tensor    = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+    return 128;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name            = */ ggml_backend_cuda_name,
+    /* .free                = */ ggml_backend_cuda_free,
+    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cuda_synchronize,
+    /* .cpy_tensor_from     = */ nullptr,
+    /* .cpy_tensor_to       = */ nullptr,
+    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op         = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
index fda704b665623..57adc9cf34bc5 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "ggml.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
@@ -42,6 +43,9 @@ GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, s
 GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
 
+// backend API
+GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml-metal.h b/ggml-metal.h
index 790cf0bf7b963..096b844e32c6f 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -20,6 +20,7 @@
 #pragma once
 
 #include "ggml.h"
+#include "ggml-backend.h"
 
 #include <stddef.h>
 #include <stdbool.h>
@@ -35,10 +36,15 @@ struct ggml_cgraph;
 extern "C" {
 #endif
 
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+//
+// internal API
+// temporary exposed to user-code
+//
 
 struct ggml_metal_context;
 
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 // number of command buffers to use
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml-metal.m b/ggml-metal.m
index 92956ed97e9c9..29cb3c922daeb 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1456,3 +1456,140 @@ void ggml_metal_graph_compute(
 
     }
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+    return "Metal";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_free(ggml_backend_t backend) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+    ggml_metal_free(ctx);
+    free(backend);
+}
+
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)buffer->context;
+}
+
+static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+    /* .free_buffer    = */ ggml_backend_metal_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_metal_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL, // no initialization required
+    /* .free_tensor    = */ NULL, // no cleanup required
+};
+
+static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    void * data = ggml_metal_host_malloc(size);
+
+    // TODO: set proper name of the buffers
+    ggml_metal_add_buffer(ctx, "backend", data, size, 0);
+
+    return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
+    return 32;
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_graph_compute(metal_ctx, cgraph);
+}
+
+static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return true;
+    UNUSED(backend);
+    UNUSED(op);
+}
+
+static struct ggml_backend_i metal_backend_i = {
+    /* .get_name            = */ ggml_backend_metal_name,
+    /* .free                = */ ggml_backend_metal_free,
+    /* .alloc_buffer        = */ ggml_backend_metal_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_metal_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_metal_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from     = */ ggml_backend_metal_cpy_tensor_from,
+    /* .cpy_tensor_to       = */ ggml_backend_metal_cpy_tensor_to,
+    /* .graph_plan_create   = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_compute  = */ NULL,
+    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
+    /* .supports_op         = */ ggml_backend_metal_supports_op,
+};
+
+ggml_backend_t ggml_backend_metal_init(void) {
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+
+    ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+
+    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
+
+    *metal_backend = (struct ggml_backend) {
+        /* .interface = */ metal_backend_i,
+        /* .context   = */ ctx,
+    };
+
+    return metal_backend;
+}
+
+bool ggml_backend_is_metal(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_metal_name;
+}
+
+void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_set_n_cb(ctx, n_cb);
+}
diff --git a/ggml.c b/ggml.c
index 911a63988e027..6d1776ca46741 100644
--- a/ggml.c
+++ b/ggml.c
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
+//
+// end of logging block
+//
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// end of logging block
-//
-
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_CPU,
+        /*.buffer       =*/ NULL,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -20203,6 +20180,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
 
+        // TODO: instead of passing &cancel here, use the return code of the linesearch
+        //       to determine if the optimization should be cancelled
+        //       this is a simple change, but not doing this atm, since I don't have a nice
+        //       way to test and don't want to break something with so many changes lined up
         ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
         if (cancel) {
             return GGML_OPT_CANCEL;
diff --git a/ggml.h b/ggml.h
index a9d4e33d9885c..3eddc44b90fdd 100644
--- a/ggml.h
+++ b/ggml.h
@@ -326,7 +326,7 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
-    enum ggml_backend {
+    enum ggml_backend_type {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_GPU = 10,
         GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +479,10 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type    type;
-        enum ggml_backend backend;
+        enum ggml_type         type;
+        enum ggml_backend_type backend;
+
+        struct ggml_backend_buffer * buffer;
 
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +516,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[12];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1358,7 +1360,7 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
@@ -1367,7 +1369,7 @@ extern "C" {
 
     // clamp
     // in-place, returns view(a)
-    struct ggml_tensor * ggml_clamp(
+    GGML_API struct ggml_tensor * ggml_clamp(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             float                 min,
@@ -2102,7 +2104,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef  __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index a4312ab72d613..77f7fa7c10555 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1730,7 +1730,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ctx, true);
         }
@@ -1748,7 +1748,7 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
         if (cur == NULL) {
@@ -2299,8 +2299,8 @@ static void llm_load_tensors(
 
                     // output
                     {
-                        ggml_backend backend_norm;
-                        ggml_backend backend_output;
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
 
                         if (n_gpu_layers > int(n_layer)) {
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2335,8 +2335,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2365,8 +2365,8 @@ static void llm_load_tensors(
                 {
                     model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                     {
-                        ggml_backend backend_norm;
-                        ggml_backend backend_output;
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
 
                         if (n_gpu_layers > int(n_layer)) {
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2401,8 +2401,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2435,8 +2435,8 @@ static void llm_load_tensors(
 
                     // output
                     {
-                        ggml_backend backend_norm;
-                        ggml_backend backend_output;
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
 
                         if (n_gpu_layers > int(n_layer)) {
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2473,8 +2473,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2512,8 +2512,8 @@ static void llm_load_tensors(
 
                     // output
                     {
-                        ggml_backend backend_norm;
-                        ggml_backend backend_output;
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
 
                         if (n_gpu_layers > int(n_layer)) {
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2550,8 +2550,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2589,8 +2589,8 @@ static void llm_load_tensors(
                     model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),  {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
                     {
-                        ggml_backend backend_norm;
-                        ggml_backend backend_output;
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
 
                         if (n_gpu_layers > int(n_layer)) {
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2624,8 +2624,8 @@ static void llm_load_tensors(
                     const int i_gpu_start = n_layer - n_gpu_layers;
                     model.layers.resize(n_layer);
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-                        const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
                         auto & layer = model.layers[i];
                         layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
                         layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index e44c3bd03fa93..4311268bd2d17 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -1,16 +1,18 @@
 #!/bin/bash
 
-cp -rpv ../ggml/src/ggml.c                ./ggml.c
-cp -rpv ../ggml/src/ggml-alloc.c          ./ggml-alloc.c
-cp -rpv ../ggml/src/ggml-cuda.h           ./ggml-cuda.h
-cp -rpv ../ggml/src/ggml-cuda.cu          ./ggml-cuda.cu
-cp -rpv ../ggml/src/ggml-opencl.h         ./ggml-opencl.h
-cp -rpv ../ggml/src/ggml-opencl.cpp       ./ggml-opencl.cpp
-cp -rpv ../ggml/src/ggml-metal.h          ./ggml-metal.h
-cp -rpv ../ggml/src/ggml-metal.m          ./ggml-metal.m
-cp -rpv ../ggml/src/ggml-metal.metal      ./ggml-metal.metal
-cp -rpv ../ggml/include/ggml/ggml.h       ./ggml.h
-cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
+cp -rpv ../ggml/src/ggml.c                  ./ggml.c
+cp -rpv ../ggml/src/ggml-alloc.c            ./ggml-alloc.c
+cp -rpv ../ggml/src/ggml-backend.c          ./ggml-backend.c
+cp -rpv ../ggml/src/ggml-cuda.h             ./ggml-cuda.h
+cp -rpv ../ggml/src/ggml-cuda.cu            ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-opencl.h           ./ggml-opencl.h
+cp -rpv ../ggml/src/ggml-opencl.cpp         ./ggml-opencl.cpp
+cp -rpv ../ggml/src/ggml-metal.h            ./ggml-metal.h
+cp -rpv ../ggml/src/ggml-metal.m            ./ggml-metal.m
+cp -rpv ../ggml/src/ggml-metal.metal        ./ggml-metal.metal
+cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
+cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
+cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
 
 cp -rpv ../ggml/tests/test-opt.cpp    ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp  ./tests/test-grad0.cpp

From dcc09d25961c5d0626bc148e558ee841141748f7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 9 Oct 2023 14:28:27 +0300
Subject: [PATCH 12/26] metal : do not use mul_mm kernels when ne00 < 64
 (#3542)

---
 ggml-metal.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 29cb3c922daeb..7d67db90f078b 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1040,7 +1040,7 @@ void ggml_metal_graph_compute(
                                 !ggml_is_transposed(src0) &&
                                 !ggml_is_transposed(src1) &&
                                 src1t == GGML_TYPE_F32 &&
-                                ne00 % 32 == 0 &&
+                                ne00 % 32 == 0 && ne00 >= 64 &&
                                 ne11 > ne11_mm_min) {
                                 //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                                 switch (src0->type) {

From fcca0a700487999d52a525c96d6661e9f6a8703a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 9 Oct 2023 14:32:17 +0300
Subject: [PATCH 13/26] refact : fix convert script + zero out KV cache to
 avoid nans (#3523)

* refact : fix convert script + zero out KV cache to avoid nans

* ggml : silu(-inf) should never happen

* metal : assert various kernel requirements
---
 convert-refact-hf-to-gguf.py   | 71 ++++------------------------------
 examples/parallel/parallel.cpp |  2 +-
 ggml-metal.m                   | 20 ++++++----
 ggml-metal.metal               | 18 ++++++---
 ggml.c                         | 27 +++++++------
 llama.cpp                      |  4 ++
 6 files changed, 51 insertions(+), 91 deletions(-)

diff --git a/convert-refact-hf-to-gguf.py b/convert-refact-hf-to-gguf.py
index e0cd417dbbbc4..bfeabc0825ba7 100755
--- a/convert-refact-hf-to-gguf.py
+++ b/convert-refact-hf-to-gguf.py
@@ -17,33 +17,6 @@
     sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
 import gguf
 
-
-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
@@ -153,53 +126,25 @@ def parse_args() -> argparse.Namespace:
 scores: list[float] = []
 toktypes: list[int] = []
 
-tokenizer_json_file = dir_model / "tokenizer.json"
-if not tokenizer_json_file.is_file():
-    print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = (
-    hparams["vocab_size"]
-    if "vocab_size" in hparams
-    else len(tokenizer_json["model"]["vocab"])
-)
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    if i in reverse_vocab:
-        text = reverse_vocab[i]
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode("utf-8"))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
-    scores.append(0.0)  # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 721888da7de94..04f1e45b955dd 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -167,7 +167,7 @@ int main(int argc, char ** argv) {
 
     // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
     // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
-    llama_batch batch = llama_batch_init(params.n_ctx, 0);
+    llama_batch batch = llama_batch_init(n_ctx, 0);
 
     int32_t n_total_prompt = 0;
     int32_t n_total_gen    = 0;
diff --git a/ggml-metal.m b/ggml-metal.m
index 7d67db90f078b..5a23144d0c891 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -779,8 +779,8 @@ void ggml_metal_graph_compute(
                         } break;
                     case GGML_OP_CONCAT:
                         {
+                            const int64_t nb = ne00;
 
-                            int64_t nb = ne00;
                             [encoder setComputePipelineState:ctx->pipeline_concat];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -812,6 +812,7 @@ void ggml_metal_graph_compute(
                             [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
 
                             const int nth = MIN(1024, ne0);
+
                             [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                         } break;
                     case GGML_OP_ADD:
@@ -909,9 +910,10 @@ void ggml_metal_graph_compute(
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                             [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
-                            const int64_t n = ggml_nelements(dst)/4;
+                            const int64_t n = ggml_nelements(dst);
+                            GGML_ASSERT(n % 4 == 0);
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                         } break;
                     case GGML_OP_UNARY:
                         switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -921,9 +923,10 @@ void ggml_metal_graph_compute(
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
-                                    const int64_t n = ggml_nelements(dst)/4;
+                                    const int64_t n = ggml_nelements(dst);
+                                    GGML_ASSERT(n % 4 == 0);
 
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                 } break;
                             case GGML_UNARY_OP_RELU:
                                 {
@@ -941,9 +944,10 @@ void ggml_metal_graph_compute(
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
-                                    const int64_t n = ggml_nelements(dst)/4;
+                                    const int64_t n = ggml_nelements(dst);
+                                    GGML_ASSERT(n % 4 == 0);
 
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                 } break;
                             default:
                                 {
@@ -1251,6 +1255,8 @@ void ggml_metal_graph_compute(
                         } break;
                     case GGML_OP_RMS_NORM:
                         {
+                            GGML_ASSERT(ne00 % 4 == 0);
+
                             float eps;
                             memcpy(&eps, dst->op_params, sizeof(float));
 
diff --git a/ggml-metal.metal b/ggml-metal.metal
index b6288db28660d..99b9fd7a748ba 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -345,10 +345,11 @@ kernel void kernel_rms_norm(
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-    device const float * x_scalar = (device const float *) x;
-    float4 sumf=0;
-    float all_sum=0;
+    device const float4 * x        = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float  * x_scalar = (device const float  *) x;
+
+    float4 sumf = 0;
+    float all_sum = 0;
 
     // parallel sum
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -361,6 +362,7 @@ kernel void kernel_rms_norm(
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
+
     // broadcast, simd group number is ntg / 32
     for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
        if (tpitg < i) {
@@ -368,7 +370,9 @@ kernel void kernel_rms_norm(
        }
     }
     if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i];
+        }
         sum[0] /= ne00;
     }
 
@@ -383,7 +387,9 @@ kernel void kernel_rms_norm(
         y[i00] = x[i00] * scale;
     }
     if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
     }
 }
 
diff --git a/ggml.c b/ggml.c
index 6d1776ca46741..5bb1da31ba624 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11233,7 +11233,7 @@ static void ggml_compute_forward_silu_f32(
 
 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
             UNUSED(x);
             assert(!isnan(x));
             assert(!isinf(x));
@@ -13066,17 +13066,17 @@ static void ggml_compute_forward_alibi_f32(
 
     assert(n_past >= 0);
 
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
+    const int64_t n  = ggml_nrows(src0);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
 
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
     GGML_ASSERT(nb0 == sizeof(float));
@@ -13088,9 +13088,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+    for (int64_t i = 0; i < ne0; i++) {
+        for (int64_t j = 0; j < ne1; j++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
 
@@ -13105,7 +13105,6 @@ static void ggml_compute_forward_alibi_f32(
                 }
 
                 pdst[0] = i * m_k + src[0];
-
             }
         }
     }
diff --git a/llama.cpp b/llama.cpp
index 77f7fa7c10555..24f07daca6181 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1325,7 +1325,11 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
+    // TODO: this should be:
+    //       cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    //       change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;

From 95bd60a0a69f57e9a2ff1269667ea484a1a9bb40 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Mon, 9 Oct 2023 14:44:58 +0200
Subject: [PATCH 14/26] ggml-alloc : fix assert in debug builds (#3555)

---
 ggml-alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-alloc.c b/ggml-alloc.c
index 3321f05e2ef94..34eba3f830e84 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -386,7 +386,7 @@ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
 
     // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
     // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend);
+    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
     ggml_backend_buffer_init_tensor(alloc->buffer, view);
 }
 

From 11ea5c7d96f2c28e1c99659e08ec0a44574056e2 Mon Sep 17 00:00:00 2001
From: vvhg1 <94630311+vvhg1@users.noreply.github.com>
Date: Tue, 10 Oct 2023 09:31:21 +0200
Subject: [PATCH 15/26] infill. : fix tokenization (#3508)

* infill tokens correction

* serverinfill tokens correction

* removing any leading whitespace from infill suffix and removing leeading space token from suffix when params.escape

* removing any leading whitespace from infill suffix and removing leeading space token from suffix when params.escape

* only rm when params.escape, rm space if possible which is added back or rm added space token

* only rm when params.escape, rm space if possible which is added back or rm added space token

* Revert "only rm when params.escape, rm space if possible which is added back or rm added space token"

This reverts commit 63ba0b621f21077c0e3bc6ba6a327534123cb738.

* fix interactive prompt escaping and fix server infill leading space handling

* rm unnecessary bool check
---
 examples/infill/infill.cpp | 37 +++++++++++++++++++++++++++++++++----
 examples/server/server.cpp | 15 +++++++++++++--
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 9ec75ce425b2a..d994de5e850c3 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -233,10 +233,22 @@ int main(int argc, char ** argv) {
     const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
     LOG("add_bos: %d\n", add_bos);
 
+    bool suff_rm_leading_spc = params.escape;
+    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+        params.input_suffix.erase(0, 1);
+        suff_rm_leading_spc = false;
+    }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    const int space_token = 29871;
+    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
+        inp_sfx.erase(inp_sfx.begin());
+    }
     inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+    if (add_bos) {
+        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
+    }
     inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
     embd_inp = inp_pfx;
     embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
@@ -627,10 +639,27 @@ int main(int argc, char ** argv) {
                 buffer.clear();
                 // done taking input, reset color
                 console::set_display(console::reset);
+
+                if (params.escape) {
+                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
+                    process_escapes(params.input_prefix);
+                    process_escapes(params.input_suffix);
+                }
+                suff_rm_leading_spc = params.escape;
+                if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+                    params.input_suffix.erase(0, 1);
+                    suff_rm_leading_spc = false;
+                }
                 // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
-                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
+                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
+                    inp_sfx.erase(inp_sfx.begin());
+                }
                 inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+                if (add_bos) {
+                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
+                }
                 inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
                 embd_inp = inp_pfx;
                 embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c53a64867336f..8c5318c650ae8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -344,9 +344,20 @@ struct llama_server_context
 
     void loadInfill()
     {
-        auto prefix_tokens = tokenize(params.input_prefix, true);  // always add BOS
-        auto suffix_tokens = tokenize(params.input_suffix, true);  // always add BOS
+        bool suff_rm_leading_spc = true;
+        if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+            params.input_suffix.erase(0, 1);
+            suff_rm_leading_spc = false;
+        }
+
+        auto prefix_tokens = tokenize(params.input_prefix, false);
+        auto suffix_tokens = tokenize(params.input_suffix, false);
+        const int space_token = 29871;
+        if (suff_rm_leading_spc  && suffix_tokens[0] == space_token) {
+            suffix_tokens.erase(suffix_tokens.begin());
+        }
         prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
         prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
         prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
         prefix_tokens.push_back(llama_token_middle(ctx));

From f5f9121de140eff558f13b5c5e78a3a3b6b94377 Mon Sep 17 00:00:00 2001
From: Jan Ploski <jpl@plosquare.com>
Date: Tue, 10 Oct 2023 09:50:23 +0200
Subject: [PATCH 16/26] llm : add MPT support (#3417)

* CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545)

* mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt

* mpt : protect against "clip_qkv": null in mpt-7b

* mpt : quick fix to avoid "Strange model" warning when quantizing MPT models

* mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?)

* mpt : standardized all tensor names to follow GGUF spec

* mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code

* mpt : fixed comment s/gptneox/mpt/

* mpt : remove tabs, trailing whitespace

* mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt

* mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252

* comment out n_past instead of marking it unused

* mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"]

* mpt : remove unused tokenizer_json in convert script

* ggml : remove obsolete n_past assert in ggml_alibi

* llama : print clam_kqv and max_alibi_bias hparams

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 convert-mpt-hf-to-gguf.py | 216 +++++++++++++++++++
 ggml-cuda.cu              |  47 ++++-
 ggml-metal.m              |   2 +-
 ggml.c                    |   4 +-
 llama.cpp                 | 425 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 685 insertions(+), 9 deletions(-)
 create mode 100755 convert-mpt-hf-to-gguf.py

diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
new file mode 100755
index 0000000000000..73a4932f7c831
--- /dev/null
+++ b/convert-mpt-hf-to-gguf.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+# HF mpt--> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def count_model_parts(dir_model: Path) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file, or model file itself (*.bin)",
+    )
+    parser.add_argument(
+        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
+        help="output format - use 0 for float32, 1 for float16",
+    )
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "MPTForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.MPT
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layers"]
+
+gguf_writer.add_name(dir_model.name)
+gguf_writer.add_context_length(hparams["max_seq_len"])
+gguf_writer.add_embedding_length(hparams["d_model"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
+gguf_writer.add_head_count(hparams["n_heads"])
+gguf_writer.add_layer_norm_eps(1e-05)
+if hparams["attn_config"]["clip_qkv"] is not None:
+    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
+gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
+# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
+# accomodate some "reserved" tokens; this is causing problems down the line in
+# llama.cpp, so we pad the vocab with dummy tokens:
+
+vocab_size = hparams["vocab_size"]
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Cannot map tensor '" + name + "'")
+            continue # for the sake of compatibility with some old published models, don't quit
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+        # note: MPT output is tied to (same as) wte in original model;
+        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
+        if new_name == "token_embd.weight":
+            gguf_writer.add_tensor("output.weight", data)
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7e92c519741b9..654d3632fc179 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -415,6 +415,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
@@ -4585,6 +4586,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
 
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
@@ -5475,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
                           const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -6419,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
     const int64_t ne02 = src0->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-    GGML_ASSERT(ne01 + n_past == ne00);
+    //GGML_ASSERT(ne01 + n_past == ne00);
     GGML_ASSERT(n_head == ne02);
 
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6500,6 +6515,24 @@ inline void ggml_cuda_op_scale(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_clamp(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const float min = ((float *) dst->op_params)[0];
+    const float max = ((float *) dst->op_params)[1];
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     const int64_t nrows0 = ggml_nrows(src0);
 
@@ -7061,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
 static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -7470,6 +7507,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_CLAMP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_clamp;
+            break;
         case GGML_OP_CPY:
             func = ggml_cuda_cpy;
             break;
diff --git a/ggml-metal.m b/ggml-metal.m
index 5a23144d0c891..87fa172161405 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1299,7 +1299,7 @@ void ggml_metal_graph_compute(
 
                             const int nth = MIN(1024, ne00);
 
-                            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+                            //const int n_past = ((int32_t *) dst->op_params)[0];
                             const int n_head = ((int32_t *) dst->op_params)[1];
                             float max_bias;
                             memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
diff --git a/ggml.c b/ggml.c
index 5bb1da31ba624..1f5598fa6af8f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -13059,13 +13059,11 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }
 
-    const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-    assert(n_past >= 0);
-
     const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int64_t ne1 = src0->ne[1]; // seq_len_without_past
     const int64_t ne2 = src0->ne[2]; // n_head -> this is k
diff --git a/llama.cpp b/llama.cpp
index 24f07daca6181..3b63b64010b0f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -424,6 +424,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
     {
@@ -1011,6 +1019,9 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
 
+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
@@ -2060,6 +2071,20 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MPT:
+            {
+                hparams.f_clamp_kqv = 0.0f;
+
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_30B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -2204,6 +2229,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_gqa            = %u\n",     __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
+    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
@@ -2649,6 +2676,73 @@ static void llm_load_tensors(
                         layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),   {64}, backend);
                     }
                 } break;
+            case LLM_ARCH_MPT:
+                {
+                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                    // output
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                        } else {
+                            backend_norm   = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                        model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
+                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                        layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                        layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) +
+                                ggml_nbytes(layer.wqkv)      +
+                                ggml_nbytes(layer.wo)        +
+                                ggml_nbytes(layer.ffn_norm)  +
+                                ggml_nbytes(layer.w2)        +
+                                ggml_nbytes(layer.w3);
+                        }
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -4505,7 +4599,6 @@ static struct ggml_cgraph * llm_build_starcoder(
     return gf;
 }
 
-
 static struct ggml_cgraph * llm_build_persimmon(
          llama_context & lctx,
      const llama_batch & batch) {
@@ -4903,6 +4996,326 @@ static struct ggml_cgraph * llm_build_persimmon(
     return gf;
 }
 
+static struct ggml_cgraph * llm_build_mpt(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    const float norm_eps       = hparams.f_norm_eps;
+    const float clamp_kqv      = hparams.f_clamp_kqv;
+    const float max_alibi_bias = hparams.f_max_alibi_bias;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
+    //        kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    //int warmup = 0;
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+            //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
+        }
+
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * attn_norm;
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        // self-attention
+        // TODO: refactor into common function (shared with LLaMA)
+        {
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);
+
+            attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+            offload_func(attn_norm);
+
+            if (1) {
+                cur = attn_norm;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            if (clamp_kqv > 0.0f) {
+                cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                offload_func_kq(cur);
+            }
+
+            const size_t wsize = ggml_type_size(cur->type);
+
+            struct ggml_tensor * Qcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                0);
+            offload_func_kq(Qcur);
+
+            struct ggml_tensor * Kcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head *  n_head);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head +     n_head_kv));
+            offload_func_kq(Kcur);
+
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // TODO: replace with ggml_add()
+            struct ggml_tensor * KQ_scaled_alibi =
+                ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+            offload_func_kq(KQ_scaled_alibi);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
+
+        struct ggml_tensor * attn_out = cur;
+
+        // feed forward
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, attn_out, norm_eps);
+                offload_func(cur);
+
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+            }
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);
+
+            cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+        }
+
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        ggml_set_name(cur, "result_norm");
+    }
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_batch & batch) {
@@ -4935,6 +5348,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_refact(lctx, batch);
             } break;
+        case LLM_ARCH_MPT:
+            {
+                result = llm_build_mpt(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -5065,7 +5482,8 @@ static int llama_decode_internal(
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
         model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT;
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -7161,7 +7579,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos) {
             ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {

From 0aa6595ae02f97f2e5ffd74bf57a8b21ac83b272 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <iainst0409@gmail.com>
Date: Tue, 10 Oct 2023 06:31:13 -0500
Subject: [PATCH 17/26] swift : improvements and fixes (#3564)

* swift : use macOS 12 as minimum requirement

* swift : add missing ggml-backend.c source

* swift : add -O3 -DNDEBUG unsafe flags
---
 Package.swift | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Package.swift b/Package.swift
index 1ea414cc149fd..4ab055b19da2e 100644
--- a/Package.swift
+++ b/Package.swift
@@ -1,10 +1,10 @@
-// swift-tools-version:5.3
+// swift-tools-version:5.5
 
 import PackageDescription
 
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.v11),
+    .macOS(.v12),
     .iOS(.v14),
     .watchOS(.v4),
     .tvOS(.v14)
@@ -41,12 +41,13 @@ let package = Package(
                 "ggml.c",
                 "llama.cpp",
                 "ggml-alloc.c",
+                "ggml-backend.c",
                 "k_quants.c",
             ] + additionalSources,
             resources: resources,
             publicHeadersPath: "spm-headers",
             cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32"]),
+                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
                 .define("GGML_USE_K_QUANTS"),
                 .define("GGML_USE_ACCELERATE")
                 // NOTE: NEW_LAPACK will required iOS version 16.4+

From 02d2875deff28599c6c2c6e1886fab002ffe43b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xingchen=20Song=28=E5=AE=8B=E6=98=9F=E8=BE=B0=29?=
 <xingchensong1996@163.com>
Date: Tue, 10 Oct 2023 22:48:21 +0800
Subject: [PATCH 18/26] llm : add bloom models (#3553)

* feat: Support bloom models

* fix(bloom): fix model size

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 convert-bloom-hf-to-gguf.py | 238 ++++++++++++++++++++++
 gguf-py/gguf/gguf.py        | 112 +++++++----
 llama.cpp                   | 383 ++++++++++++++++++++++++++++++++++--
 3 files changed, 678 insertions(+), 55 deletions(-)
 create mode 100755 convert-bloom-hf-to-gguf.py

diff --git a/convert-bloom-hf-to-gguf.py b/convert-bloom-hf-to-gguf.py
new file mode 100755
index 0000000000000..7bfc95ec11dae
--- /dev/null
+++ b/convert-bloom-hf-to-gguf.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+# HF bloom --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def count_model_parts(dir_model: Path) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+# Supported Models:
+#   https://huggingface.co/bigscience/bloom-1b7
+#   https://huggingface.co/bigscience/bloom-3b
+#   https://huggingface.co/bigscience/bloom-7b1
+#   https://huggingface.co/Langboat/bloom-1b4-zh
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
+    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "BloomForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.BLOOM
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name("Bloom")
+n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
+n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
+gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
+gguf_writer.add_embedding_length(n_embed)
+gguf_writer.add_feed_forward_length(4 * n_embed)
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(n_head)
+gguf_writer.add_head_count_kv(n_head)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+gguf_writer.add_file_type(ftype)
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+# params for qkv transform
+n_head_kv = hparams.get("n_head_kv", n_head)
+head_dim = n_embed // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")
+
+    has_lm_head = True
+    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
+        has_lm_head = False
+
+    for original_name in model_part.keys():
+        data = model_part[original_name]
+        name = re.sub(r'transformer\.', '', original_name)
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+            # Map bloom-style qkv_linear to gpt-style qkv_linear
+            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
+            data = np.concatenate(
+                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
+                axis=0
+            )
+            print("re-format attention.linear_qkv.weight")
+        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
+            data = np.concatenate(
+                (qkv_bias[:, 0, :].reshape((n_embed,)),
+                 qkv_bias[:, 1, :].reshape((n_embed,)),
+                 qkv_bias[:, 2, :].reshape((n_embed,))),
+                axis=0
+            )
+            print("re-format attention.linear_qkv.bias")
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+        if not has_lm_head and name == "word_embeddings.weight":
+            gguf_writer.add_tensor("output.weight", data)
+            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index fb677a6ed7283..557ce7ac0173c 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -88,29 +88,31 @@ class MODEL_ARCH(IntEnum):
     PERSIMMON     : int = auto()
     REFACT        : int = auto()
     BERT          : int = auto()
+    BLOOM         : int = auto()
 
 
 class MODEL_TENSOR(IntEnum):
-    TOKEN_EMBD   : int = auto()
-    TOKEN_TYPES  : int = auto()
-    POS_EMBD     : int = auto()
-    OUTPUT       : int = auto()
-    OUTPUT_NORM  : int = auto()
-    ROPE_FREQS   : int = auto()
-    ATTN_Q       : int = auto()
-    ATTN_K       : int = auto()
-    ATTN_V       : int = auto()
-    ATTN_QKV     : int = auto()
-    ATTN_OUT     : int = auto()
-    ATTN_NORM    : int = auto()
-    ATTN_NORM_2  : int = auto()
-    ATTN_ROT_EMBD: int = auto()
-    FFN_GATE     : int = auto()
-    FFN_DOWN     : int = auto()
-    FFN_UP       : int = auto()
-    FFN_NORM     : int = auto()
-    ATTN_Q_NORM  : int = auto()
-    ATTN_K_NORM  : int = auto()
+    TOKEN_EMBD      : int = auto()
+    TOKEN_EMBD_NORM : int = auto()
+    TOKEN_TYPES     : int = auto()
+    POS_EMBD        : int = auto()
+    OUTPUT          : int = auto()
+    OUTPUT_NORM     : int = auto()
+    ROPE_FREQS      : int = auto()
+    ATTN_Q          : int = auto()
+    ATTN_K          : int = auto()
+    ATTN_V          : int = auto()
+    ATTN_QKV        : int = auto()
+    ATTN_OUT        : int = auto()
+    ATTN_NORM       : int = auto()
+    ATTN_NORM_2     : int = auto()
+    ATTN_ROT_EMBD   : int = auto()
+    FFN_GATE        : int = auto()
+    FFN_DOWN        : int = auto()
+    FFN_UP          : int = auto()
+    FFN_NORM        : int = auto()
+    ATTN_Q_NORM     : int = auto()
+    ATTN_K_NORM     : int = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -125,29 +127,31 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.PERSIMMON:      "persimmon",
     MODEL_ARCH.REFACT:         "refact",
     MODEL_ARCH.BERT:           "bert",
+    MODEL_ARCH.BLOOM:          "bloom",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
-    MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
-    MODEL_TENSOR.TOKEN_TYPES:   "token_types",
-    MODEL_TENSOR.POS_EMBD:      "position_embd",
-    MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
-    MODEL_TENSOR.OUTPUT:        "output",
-    MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
-    MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
-    MODEL_TENSOR.ATTN_NORM_2:   "blk.{bid}.attn_norm_2",
-    MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
-    MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
-    MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
-    MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
-    MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
-    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
-    MODEL_TENSOR.ATTN_Q_NORM:   "blk.{bid}.attn_q_norm",
-    MODEL_TENSOR.ATTN_K_NORM:   "blk.{bid}.attn_k_norm",
-    MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
-    MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
-    MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
-    MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    MODEL_TENSOR.TOKEN_EMBD:      "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
+    MODEL_TENSOR.TOKEN_TYPES:     "token_types",
+    MODEL_TENSOR.POS_EMBD:        "position_embd",
+    MODEL_TENSOR.OUTPUT_NORM:     "output_norm",
+    MODEL_TENSOR.OUTPUT:          "output",
+    MODEL_TENSOR.ROPE_FREQS:      "rope_freqs",
+    MODEL_TENSOR.ATTN_NORM:       "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2:     "blk.{bid}.attn_norm_2",
+    MODEL_TENSOR.ATTN_QKV:        "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q:          "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K:          "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V:          "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_OUT:        "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
+    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
+    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -282,6 +286,18 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.BLOOM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GPT2: [
         # TODO
     ],
@@ -311,6 +327,7 @@ class TensorNameMap:
             "gpt_neox.embed_in",                        # gptneox
             "transformer.wte",                          # gpt2 gpt-j mpt refact
             "transformer.word_embeddings",              # falcon
+            "word_embeddings",                          # bloom
             "model.embed_tokens",                       # llama-hf
             "tok_embeddings",                           # llama-pth
             "embeddings.word_embeddings",               # bert
@@ -322,6 +339,11 @@ class TensorNameMap:
             "embeddings.token_type_embeddings",  # bert
         ),
 
+        # Normalization of token embeddings
+        MODEL_TENSOR.TOKEN_EMBD_NORM: (
+            "word_embeddings_layernorm",  # bloom
+        ),
+
         # Position embeddings
         MODEL_TENSOR.POS_EMBD: (
             "transformer.wpe",                 # gpt2
@@ -332,7 +354,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                # gptneox
             "lm_head",                  # gpt2 mpt falcon llama-hf baichuan
-            "output",                   # llama-pth
+            "output",                   # llama-pth bloom
             "word_embeddings_for_head", # persimmon
         ),
 
@@ -344,7 +366,7 @@ class TensorNameMap:
             "norm",                                   # llama-pth
             "embeddings.LayerNorm",                   # bert
             "transformer.norm_f",                     # mpt
-            "ln_f",                                   # refact
+            "ln_f",                                   # refact bloom
             "language_model.encoder.final_layernorm", # persimmon
         ),
 
@@ -361,6 +383,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_1",                            # gpt2 gpt-j refact
             "transformer.blocks.{bid}.norm_1",                     # mpt
             "transformer.h.{bid}.input_layernorm",                 # falcon7b
+            "h.{bid}.input_layernorm",                             # bloom
             "transformer.h.{bid}.ln_mlp",                          # falcon40b
             "model.layers.{bid}.input_layernorm",                  # llama-hf
             "layers.{bid}.attention_norm",                         # llama-pth
@@ -379,6 +402,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.c_attn",                                    # gpt2
             "transformer.blocks.{bid}.attn.Wqkv",                                 # mpt
             "transformer.h.{bid}.self_attention.query_key_value",                 # falcon
+            "h.{bid}.self_attention.query_key_value",                             # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
         ),
 
@@ -412,6 +436,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.c_proj",                         # gpt2 refact
             "transformer.blocks.{bid}.attn.out_proj",                  # mpt
             "transformer.h.{bid}.self_attention.dense",                # falcon
+            "h.{bid}.self_attention.dense",                            # bloom
             "model.layers.{bid}.self_attn.o_proj",                     # llama-hf
             "layers.{bid}.attention.wo",                               # llama-pth
             "encoder.layer.{bid}.attention.output.dense",              # bert
@@ -429,6 +454,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm",               # gptneox
             "transformer.h.{bid}.ln_2",                                     # gpt2 refact
+            "h.{bid}.post_attention_layernorm",                             # bloom
             "transformer.blocks.{bid}.norm_2",                              # mpt
             "model.layers.{bid}.post_attention_layernorm",                  # llama-hf
             "layers.{bid}.ffn_norm",                                        # llama-pth
@@ -442,6 +468,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc",                          # gpt2
             "transformer.blocks.{bid}.ffn.up_proj",                  # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",                 # falcon
+            "h.{bid}.mlp.dense_h_to_4h",                             # bloom
             "model.layers.{bid}.mlp.up_proj",                        # llama-hf refact
             "layers.{bid}.feed_forward.w3",                          # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                # bert
@@ -461,6 +488,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_proj",                        # gpt2 refact
             "transformer.blocks.{bid}.ffn.down_proj",                # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",                 # falcon
+            "h.{bid}.mlp.dense_4h_to_h",                             # bloom
             "model.layers.{bid}.mlp.down_proj",                      # llama-hf
             "layers.{bid}.feed_forward.w2",                          # llama-pth
             "encoder.layer.{bid}.output.dense",                      # bert
diff --git a/llama.cpp b/llama.cpp
index 3b63b64010b0f..4653c80232c5c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -188,6 +188,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -201,7 +202,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAICHUAN,        "baichuan"  },
     { LLM_ARCH_STARCODER,       "starcoder" },
     { LLM_ARCH_PERSIMMON,       "persimmon" },
-    { LLM_ARCH_REFACT,          "refact" },
+    { LLM_ARCH_REFACT,          "refact"    },
+    { LLM_ARCH_BLOOM,           "bloom"     },
 };
 
 enum llm_kv {
@@ -304,6 +306,7 @@ struct LLM_KV {
 
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -466,6 +469,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1207,6 +1225,8 @@ struct llama_model {
 
     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_norm;
+    struct ggml_tensor * tok_norm_b;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -2056,13 +2076,13 @@ static void llm_load_hparams(
                 }
             } break;
         case LLM_ARCH_PERSIMMON:
-        {
-            GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
-            switch (hparams.n_layer) {
-                case 36: model.type = e_model::MODEL_8B; break;
-                default: model.type = e_model::MODEL_UNKNOWN;
-            }
-        } break;
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_REFACT:
             {
                 GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -2071,6 +2091,19 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                        } break;
+                }
+            } break;
         case LLM_ARCH_MPT:
             {
                 hparams.f_clamp_kqv = 0.0f;
@@ -2676,6 +2709,88 @@ static void llm_load_tensors(
                         layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),   {64}, backend);
                     }
                 } break;
+            case LLM_ARCH_BLOOM:
+                {
+                    // TODO: CPU-only for now
+
+                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_norm       = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd},          GGML_BACKEND_CPU);
+                    model.tok_norm_b     = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd},          GGML_BACKEND_CPU);
+
+                    // output
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                        } else {
+                            backend_norm   = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                        model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+                        model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                            vram_weights += ggml_nbytes(model.output_norm_b);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, backend);
+                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, backend);
+
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);
+
+                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},                backend_split);
+                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},                        backend_split);
+
+                        layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                        layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
+
+                        layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                        layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);
+
+                        layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                        layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend_split);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                                ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                                ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                                ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+                                ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3)          +
+                                ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2);
+                        }
+                    }
+                } break;
             case LLM_ARCH_MPT:
                 {
                     model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
@@ -4996,6 +5111,248 @@ static struct ggml_cgraph * llm_build_persimmon(
     return gf;
 }
 
+static struct ggml_cgraph * llm_build_bloom(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, token, norm_eps);
+        inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
+    }
+
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            // store key and value to memory
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llm_build_mpt(
          llama_context & lctx,
      const llama_batch & batch) {
@@ -5025,9 +5382,6 @@ static struct ggml_cgraph * llm_build_mpt(
     const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
-    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
-    //        kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
-
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
@@ -5348,6 +5702,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_refact(lctx, batch);
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                result = llm_build_bloom(lctx, batch);
+            } break;
         case LLM_ARCH_MPT:
             {
                 result = llm_build_mpt(lctx, batch);
@@ -7579,8 +7937,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {

From c5b49360d0d9e49f32e05a9116e90bd0b39a282d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xingchen=20Song=28=E5=AE=8B=E6=98=9F=E8=BE=B0=29?=
 <xingchensong1996@163.com>
Date: Wed, 11 Oct 2023 00:28:50 +0800
Subject: [PATCH 19/26] readme : add bloom (#3570)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0562795620e69..5c2b57cb4b83b 100644
--- a/README.md
+++ b/README.md
@@ -96,6 +96,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
+- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 
 **Bindings:**
 

From 233fc1c69f6f415f35363e18a755f9610e89161b Mon Sep 17 00:00:00 2001
From: goerch <jhr.walter@t-online.de>
Date: Tue, 10 Oct 2023 18:59:52 +0200
Subject: [PATCH 20/26] Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess

* Don't add bos token in test
---
 llama.cpp                         | 9 ++++-----
 tests/test-tokenizer-0-falcon.cpp | 8 ++++----
 tests/test-tokenizer-0-falcon.py  | 9 +++++----
 tests/test-tokenizer-0-llama.cpp  | 4 +---
 tests/test-tokenizer-0-llama.py   | 7 +++----
 5 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4653c80232c5c..7ed8722376d2d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6342,7 +6342,6 @@ struct llm_tokenizer_bpe {
         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
             bool split_condition = false;
-            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
             int bytes_remain = text_utf.size() - i;
             // forward backward lookups
             const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -6368,9 +6367,9 @@ struct llm_tokenizer_bpe {
             if (!split_condition && bytes_remain >= 3) {
                 // 're|'ve|'ll
                 if (utf_char == "\'" && (
-                    (utf_char_next == "r" || utf_char_next_next == "e") ||
-                    (utf_char_next == "v" || utf_char_next_next == "e") ||
-                    (utf_char_next == "l" || utf_char_next_next == "l"))
+                    (utf_char_next == "r" && utf_char_next_next == "e") ||
+                    (utf_char_next == "v" && utf_char_next_next == "e") ||
+                    (utf_char_next == "l" && utf_char_next_next == "l"))
                     ) {
                     split_condition = true;
                 }
@@ -6421,7 +6420,7 @@ struct llm_tokenizer_bpe {
                 else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                     split_condition = true;
                 }
-                else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+                else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                     split_condition = true;
                 }
             }
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
index 0f3c50bce8ae9..a4e9d2b912728 100644
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { "   Hello"              , {     258,  23090, }, },
         { "    Hello"             , {     466,  23090, }, },
         { "    Hello\n    Hello"  , {     466,  23090,    742,  23090, }, },
+        { "\n ="                  , {    1212,     40, }, },
+        { "' era"                 , {      18,   4932, }, },
     };
 
     return _k_tests;
@@ -155,7 +157,7 @@ int main(int argc, char **argv) {
 
         fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
 
         fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -169,10 +171,8 @@ int main(int argc, char **argv) {
             }
 
             for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
             }
-
-            ofs << "\n";
         }
 
         fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py
index 9c8c1c7d1d3ca..cf65a3f65d72c 100644
--- a/tests/test-tokenizer-0-falcon.py
+++ b/tests/test-tokenizer-0-falcon.py
@@ -41,6 +41,8 @@
         "   Hello",
         "    Hello",
         "    Hello\n    Hello",
+        "\n =",
+        "' era",
     ]
 
 for text in tests:
@@ -69,15 +71,14 @@
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
         s = ''.join(lines)
         res = tokenizer.encode(s)
         # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
             for x in res:
-                f.write(str(x) + ' ')
-            f.write('\n')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
     print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
index 91c841f7bba8f..39c8d188c9086 100644
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
             }
 
             for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
             }
-
-            ofs << "\n";
         }
 
         fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py
index bc164ee296cb1..078f680b165ca 100644
--- a/tests/test-tokenizer-0-llama.py
+++ b/tests/test-tokenizer-0-llama.py
@@ -81,15 +81,14 @@
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
         s = ''.join(lines)
         res = tokenizer.encode(s, add_bos=True)
         # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
             for x in res:
-                f.write(str(x) + ' ')
-            f.write('\n')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
     print('results written to: ', fname_out)

From 9f6ede19f3cfa50d4a51a5babb056c3f8a450b80 Mon Sep 17 00:00:00 2001
From: Galunid <karolek1231456@gmail.com>
Date: Wed, 11 Oct 2023 01:02:49 +0200
Subject: [PATCH 21/26] Add MPT model to supported models in README.md (#3574)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5c2b57cb4b83b..0f1fd75656926 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 
 **Bindings:**
 

From 24ba3d829e31a6eda3fa1723f692608c2fa3adda Mon Sep 17 00:00:00 2001
From: Zane Shannon <z@zcs.me>
Date: Wed, 11 Oct 2023 04:14:05 -0700
Subject: [PATCH 22/26] examples : add batched.swift + improve CI for swift
 (#3562)

---
 .github/workflows/build.yml               |   5 +
 Makefile                                  |   7 +-
 examples/batched.swift/.gitignore         |   9 +
 examples/batched.swift/Makefile           |   6 +
 examples/batched.swift/Package.swift      |  22 ++
 examples/batched.swift/README.md          |   4 +
 examples/batched.swift/Sources/main.swift | 255 ++++++++++++++++++++++
 7 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 examples/batched.swift/.gitignore
 create mode 100755 examples/batched.swift/Makefile
 create mode 100644 examples/batched.swift/Package.swift
 create mode 100644 examples/batched.swift/README.md
 create mode 100644 examples/batched.swift/Sources/main.swift

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e41be76db0820..5af497a3ce321 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -276,6 +276,11 @@ jobs:
         run: |
           xcodebuild -scheme llama -destination "${{ matrix.destination }}"
 
+      - name: Build Swift Example
+        id: make_build_swift_example
+        run: |
+            make swift
+
   windows-latest-cmake:
     runs-on: windows-latest
 
diff --git a/Makefile b/Makefile
index 40187c4a25e62..87e7bb604c0c8 100644
--- a/Makefile
+++ b/Makefile
@@ -617,6 +617,11 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif
 
+ifeq ($(UNAME_S),Darwin)
+swift: examples/batched.swift
+	(cd examples/batched.swift; make build)
+endif
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@@ -637,7 +642,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 run-benchmark-matmult: benchmark-matmult
 	./$@
 
-.PHONY: run-benchmark-matmult
+.PHONY: run-benchmark-matmult swift
 
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
diff --git a/examples/batched.swift/.gitignore b/examples/batched.swift/.gitignore
new file mode 100644
index 0000000000000..e1e863bec6d5d
--- /dev/null
+++ b/examples/batched.swift/.gitignore
@@ -0,0 +1,9 @@
+.DS_Store
+/.build
+/Packages
+xcuserdata/
+DerivedData/
+.swiftpm/configuration/registries.json
+.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
+.netrc
+batched_swift
diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile
new file mode 100755
index 0000000000000..2afb24fb85a1a
--- /dev/null
+++ b/examples/batched.swift/Makefile
@@ -0,0 +1,6 @@
+.PHONY: build
+
+build:
+	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
+	rm -f ./batched_swift
+	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift
new file mode 100644
index 0000000000000..826491defd863
--- /dev/null
+++ b/examples/batched.swift/Package.swift
@@ -0,0 +1,22 @@
+// swift-tools-version: 5.5
+// The swift-tools-version declares the minimum version of Swift required to build this package.
+
+import PackageDescription
+
+let package = Package(
+    name: "batched_swift",
+    platforms: [.macOS(.v12)],
+    dependencies: [
+        .package(name: "llama", path: "../../"),
+    ],
+    targets: [
+        // Targets are the basic building blocks of a package, defining a module or a test suite.
+        // Targets can depend on other targets in this package and products from dependencies.
+        .executableTarget(
+            name: "batched_swift",
+            dependencies: ["llama"],
+            path: "Sources",
+            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
+        ),
+    ]
+)
diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md
new file mode 100644
index 0000000000000..464c9079c4660
--- /dev/null
+++ b/examples/batched.swift/README.md
@@ -0,0 +1,4 @@
+This is a swift clone of `examples/batched`.
+
+$ `make`
+$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
new file mode 100644
index 0000000000000..938f30512ca6a
--- /dev/null
+++ b/examples/batched.swift/Sources/main.swift
@@ -0,0 +1,255 @@
+import Foundation
+import llama
+
+let arguments = CommandLine.arguments
+
+// Check that we have at least one argument (the model path)
+guard arguments.count > 1 else {
+    print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
+    exit(1)
+}
+
+let modelPath: String = arguments[1]
+let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
+let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
+
+// total length of the sequences including the prompt
+let n_len: Int = 32
+
+// init LLM
+llama_backend_init(false)
+defer {
+    llama_backend_free()
+}
+
+let model_params = llama_model_default_params()
+guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
+    print("Failed to load model")
+    exit(1)
+}
+
+defer {
+    llama_free_model(model)
+}
+
+var tokens = tokenize(text: prompt, add_bos: true)
+
+let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
+
+var context_params = llama_context_default_params()
+context_params.seed = 1234
+context_params.n_ctx = n_kv_req
+context_params.n_batch = UInt32(max(n_len, n_parallel))
+context_params.n_threads = 8
+context_params.n_threads_batch = 8
+
+let context = llama_new_context_with_model(model, context_params)
+guard context != nil else {
+    print("Failed to initialize context")
+    exit(1)
+}
+
+defer {
+    llama_free(context)
+}
+
+let n_ctx = llama_n_ctx(context)
+
+print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
+
+if n_kv_req > n_ctx {
+    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
+    exit(1)
+}
+
+var buffer: [CChar] = []
+for id: llama_token in tokens {
+    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
+}
+
+print("\n")
+
+var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
+defer {
+    llama_batch_free(batch)
+}
+
+// evaluate the initial prompt
+batch.n_tokens = Int32(tokens.count)
+
+for (i, token) in tokens.enumerated() {
+    batch.token[i] = token
+    batch.pos[i] = Int32(i)
+    batch.seq_id[i] = 0
+    batch.logits[i] = 0
+}
+
+// llama_decode will output logits only for the last token of the prompt
+batch.logits[Int(batch.n_tokens) - 1] = 1
+
+if llama_decode(context, batch) != 0 {
+    print("llama_decode() failed")
+    exit(1)
+}
+
+for i in 1 ..< n_parallel {
+    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+}
+
+if n_parallel > 1 {
+    print("generating \(n_parallel) sequences ...\n")
+}
+
+var streams: [String] = .init(repeating: "", count: n_parallel)
+var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
+var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
+
+var n_cur = batch.n_tokens
+var n_decode = 0
+
+let t_main_start = ggml_time_us()
+
+while n_cur <= n_len {
+    // prepare the next batch
+    batch.n_tokens = 0
+
+    // sample the next token for each parallel sequence / stream
+    for i in 0 ..< n_parallel {
+        if i_batch[i] < 0 {
+            // the stream has already finished
+            continue
+        }
+
+        var n_vocab = llama_n_vocab(model)
+        var logits = llama_get_logits_ith(context, i_batch[i])
+
+        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
+
+        for token_id in 0 ..< n_vocab {
+            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
+        }
+
+        var candidates_p: llama_token_data_array = .init(
+            data: &candidates,
+            size: candidates.count,
+            sorted: false
+        )
+
+        let top_k: Int32 = 40
+        let top_p: Float = 0.9
+        let temp: Float = 0.4
+
+        llama_sample_top_k(context, &candidates_p, top_k, 1)
+        llama_sample_top_p(context, &candidates_p, top_p, 1)
+        llama_sample_temp(context, &candidates_p, temp)
+
+        let new_token_id = llama_sample_token(context, &candidates_p)
+
+        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+
+        // is it an end of stream? -> mark the stream as finished
+        if new_token_id == llama_token_eos(context) || n_cur == n_len {
+            i_batch[i] = -1
+            // print("")
+            if n_parallel > 1 {
+                print("stream \(i) finished at n_cur = \(n_cur)")
+            }
+
+            continue
+        }
+
+        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
+
+        // if there is only one stream, we print immediately to stdout
+        if n_parallel == 1 {
+            print(nextStringPiece, terminator: "")
+        }
+        streams[i] += nextStringPiece
+
+        // push this new token for next evaluation
+        batch.token[Int(batch.n_tokens)] = new_token_id
+        batch.pos[Int(batch.n_tokens)] = n_cur
+        batch.seq_id[Int(batch.n_tokens)] = Int32(i)
+        batch.logits[Int(batch.n_tokens)] = 1
+
+        i_batch[i] = batch.n_tokens
+
+        batch.n_tokens += 1
+
+        n_decode += 1
+    }
+
+    // all streams are finished
+    if batch.n_tokens == 0 {
+        break
+    }
+
+    n_cur += 1
+
+    // evaluate the current batch with the transformer model
+    if llama_decode(context, batch) != 0 {
+        print("llama_decode() failed")
+        exit(1)
+    }
+}
+
+if n_parallel > 1 {
+    print("\n")
+    for (i, stream) in streams.enumerated() {
+        print("sequence \(i):\n\n\(prompt)\(stream)\n")
+    }
+}
+
+let t_main_end = ggml_time_us()
+
+print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
+
+llama_print_timings(context)
+
+private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
+    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
+    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
+    var swiftTokens: [llama_token] = []
+    for i in 0 ..< tokenCount {
+        swiftTokens.append(tokens[Int(i)])
+    }
+    tokens.deallocate()
+    return swiftTokens
+}
+
+private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
+    var result = [CChar](repeating: 0, count: 8)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    if nTokens < 0 {
+        if result.count >= -Int(nTokens) {
+            result.removeLast(-Int(nTokens))
+        } else {
+            result.removeAll()
+        }
+        let check = llama_token_to_piece(
+            model,
+            token,
+            &result,
+            Int32(result.count)
+        )
+        assert(check == nTokens)
+    } else {
+        result.removeLast(result.count - Int(nTokens))
+    }
+    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
+        return utfString
+    } else {
+        buffer.append(contentsOf: result)
+        let data = Data(buffer.map { UInt8(bitPattern: $0) })
+        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
+            buffer = []
+        }
+        guard let bufferString = String(data: data, encoding: .utf8) else {
+            return nil
+        }
+        buffer = []
+        return bufferString
+    }
+    return nil
+}

From 8c70a5ff25964f0a81e20d142a2f5ac5baff22fc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 11 Oct 2023 21:25:33 +0300
Subject: [PATCH 23/26] batched : add bench tool (#3545)

* batched : add bench tool

* batched : minor fix table

* batched-bench : add readme + n_kv_max is now configurable

* batched-bench : init warm-up batch

* batched-bench : pass custom set of PP, TG and PL

* batched-bench : add mmq CLI arg
---
 .gitignore                               |   1 +
 Makefile                                 |  13 +-
 examples/CMakeLists.txt                  |   1 +
 examples/batched-bench/CMakeLists.txt    |   5 +
 examples/batched-bench/README.md         |  51 +++++
 examples/batched-bench/batched-bench.cpp | 251 +++++++++++++++++++++++
 examples/batched/batched.cpp             |   2 +-
 7 files changed, 321 insertions(+), 3 deletions(-)
 create mode 100644 examples/batched-bench/CMakeLists.txt
 create mode 100644 examples/batched-bench/README.md
 create mode 100644 examples/batched-bench/batched-bench.cpp

diff --git a/.gitignore b/.gitignore
index 420e0d6d016a2..d288e66fcc3d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,6 +55,7 @@ models-mnt
 /server
 /simple
 /batched
+/batched-bench
 /export-lora
 /finetune
 /speculative
diff --git a/Makefile b/Makefile
index 87e7bb604c0c8..571ad3bbec1fe 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,14 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = \
+	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench baby-llama beam-search  \
+	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+TEST_TARGETS = \
+	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
+	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -557,6 +563,9 @@ simple: examples/simple/simple.cpp                            build-info.h ggml.
 batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+batched-bench: examples/batched-bench/batched-bench.cpp       build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 quantize: examples/quantize/quantize.cpp                      build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index de4cf7a691768..ab84593703caa 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -25,6 +25,7 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(batched)
+    add_subdirectory(batched-bench)
     add_subdirectory(speculative)
     add_subdirectory(parallel)
     add_subdirectory(embd-input)
diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt
new file mode 100644
index 0000000000000..40a032c514d11
--- /dev/null
+++ b/examples/batched-bench/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET batched-bench)
+add_executable(${TARGET} batched-bench.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
new file mode 100644
index 0000000000000..34b343f66d6b9
--- /dev/null
+++ b/examples/batched-bench/README.md
@@ -0,0 +1,51 @@
+# llama.cpp/example/batched-bench
+
+Benchmark the batched decoding performance of `llama.cpp`
+
+## Usage
+
+There are 2 modes of operation:
+
+- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
+- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
+
+```bash
+./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+
+# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+
+# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+
+# custom set of batches
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+```
+
+## Sample results
+
+- `PP` - prompt tokens per batch
+- `TG` - generated tokens per batch
+- `B` - number of batches
+- `N_KV` - required KV cache size
+- `T_PP` - prompt processing time (i.e. time to first token)
+- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
+- `T_TG` - time to generate all batches
+- `S_TG` - text generation speed (`(B*TG)/T_TG`)
+- `T` - total time
+- `S` - total speed (i.e. all tokens / total time)
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   128 |    128 |    1 |    256 |    0.108 |  1186.64 |    3.079 |    41.57 |    3.187 |    80.32 |
+|   128 |    128 |    2 |    512 |    0.198 |  1295.19 |    5.029 |    50.90 |    5.227 |    97.95 |
+|   128 |    128 |    4 |   1024 |    0.373 |  1373.96 |    6.878 |    74.44 |    7.251 |   141.23 |
+|   128 |    128 |    8 |   2048 |    0.751 |  1363.27 |    7.344 |   139.43 |    8.095 |   252.99 |
+|   128 |    128 |   16 |   4096 |    1.570 |  1304.68 |    8.455 |   242.23 |   10.024 |   408.60 |
+|   128 |    128 |   32 |   8192 |    3.408 |  1201.73 |    8.801 |   465.40 |   12.209 |   670.96 |
+|   128 |    256 |    1 |    384 |    0.107 |  1196.70 |    6.329 |    40.45 |    6.436 |    59.67 |
+|   128 |    256 |    2 |    768 |    0.194 |  1317.45 |   10.239 |    50.00 |   10.433 |    73.61 |
+|   128 |    256 |    4 |   1536 |    0.366 |  1399.03 |   13.960 |    73.35 |   14.326 |   107.22 |
+|   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
+|   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
+|   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
new file mode 100644
index 0000000000000..3e1e0716d8312
--- /dev/null
+++ b/examples/batched-bench/batched-bench.cpp
@@ -0,0 +1,251 @@
+#include "common.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// mutates the input string
+static std::vector<int> parse_list(char * p) {
+    std::vector<int> ret;
+
+    char * q = p;
+
+    while (*p) {
+        if (*p == ',') {
+            *p = '\0';
+            ret.push_back(std::atoi(q));
+            q = p + 1;
+        }
+
+        ++p;
+    }
+
+    ret.push_back(std::atoi(q));
+
+    return ret;
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
+        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
+        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        return 1 ;
+    }
+
+    int n_kv_max     = 2048;
+    int is_pp_shared = 0;
+    int n_gpu_layers = 0;
+    int mmq          = 0;
+
+    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
+    std::vector<int> n_tg = { 128, 256, };
+    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
+    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
+
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+
+    if (argc >= 3) {
+        n_kv_max = std::atoi(argv[2]);
+    }
+
+    if (argc >= 4) {
+        is_pp_shared = std::atoi(argv[3]);
+    }
+
+    if (argc >= 5) {
+        n_gpu_layers = std::atoi(argv[4]);
+    }
+
+    if (argc >= 6) {
+        mmq = std::atoi(argv[5]);
+    }
+
+    if (argc >= 7) {
+        n_pp = parse_list(argv[6]);
+    }
+
+    if (argc >= 8) {
+        n_tg = parse_list(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_pl = parse_list(argv[8]);
+    }
+
+    // init LLM
+
+    llama_backend_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = llama_model_default_params();
+
+    model_params.n_gpu_layers = n_gpu_layers;
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    ctx_params.seed      = 1234;
+    ctx_params.n_ctx     = n_kv_max;
+    ctx_params.n_batch   = 512;
+    ctx_params.mul_mat_q = mmq;
+
+    ctx_params.n_threads       = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_init(n_kv_max, 0);
+
+    // decode in batches of ctx_params.n_batch tokens
+    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
+        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+            llama_batch batch_view = {
+                n_tokens,
+                batch.token  + i,
+                nullptr,
+                batch.pos    + i,
+                batch.seq_id + i,
+                batch.logits + i,
+                0, 0, 0, // unused
+            };
+
+            const int ret = llama_decode(ctx, batch_view);
+            if (ret != 0) {
+                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                return false;
+            }
+        }
+
+        return true;
+    };
+
+    // warm up
+    {
+        batch.n_tokens = 16;
+
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            batch.token[i]  = 0;
+            batch.pos[i]    = i;
+            batch.seq_id[i] = 0;
+            batch.logits[i] = false;
+        }
+
+        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+    }
+
+    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
+    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+
+    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
+        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
+            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
+                const int pp = n_pp[i_pp];
+                const int tg = n_tg[i_tg];
+                const int pl = n_pl[i_pl];
+
+                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
+
+                if (n_ctx_req > n_kv_max) {
+                    continue;
+                }
+
+                batch.n_tokens = is_pp_shared ? pp : pl*pp;
+
+                for (int i = 0; i < batch.n_tokens; ++i) {
+                    batch.token[i]  = 0;
+                    batch.pos[i]    = i;
+                    batch.seq_id[i] = 0;
+                    batch.logits[i] = false;
+                }
+                batch.logits[batch.n_tokens - 1] = true;
+
+                const auto t_pp_start = ggml_time_us();
+
+                llama_kv_cache_tokens_rm(ctx, -1, -1);
+
+                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    return 1;
+                }
+
+                if (is_pp_shared) {
+                    for (int32_t i = 1; i < pl; ++i) {
+                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
+                    }
+                }
+
+                const auto t_pp_end = ggml_time_us();
+
+                const auto t_tg_start = ggml_time_us();
+
+                for (int i = 0; i < tg; ++i) {
+                    batch.n_tokens = pl;
+
+                    for (int j = 0; j < pl; ++j) {
+                        batch.token[j]  = 0;
+                        batch.pos[j]    = pp + i;
+                        batch.seq_id[j] = j;
+                        batch.logits[j] = true;
+                    }
+
+                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+                        LOG_TEE("%s: llama_decode() failed\n", __func__);
+                        return 1;
+                    }
+                }
+
+                const auto t_tg_end = ggml_time_us();
+
+                const int32_t n_kv = n_ctx_req;
+
+                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
+                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
+                const float t    = t_pp + t_tg;
+
+                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
+                const float speed_tg = pl*tg / t_tg;
+                const float speed    = n_kv / t;
+
+                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            }
+        }
+    }
+
+    llama_print_timings(ctx);
+
+    llama_batch_free(batch);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    fprintf(stderr, "\n\n");
+
+    return 0;
+}
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 688ef221335a9..a88e022d69aec 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
     ctx_params.seed  = 1234;
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_threads = params.n_threads;
+    ctx_params.n_threads       = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

From 70c29da118cdb02bfcbd0376c32b5b2236e48e48 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Wed, 11 Oct 2023 13:35:46 -0600
Subject: [PATCH 24/26] common : fix mirostat state when using multiple
 sequences (#3543)

* Fix mirostat state when using multiple sequences

* Fix mirostat by completely refactoring sampling!

* Try to fix zig build.

* Export function to fetch/create default sampler states

Code formatting cleanups and add some comments

Silence a warning about id not being used when logging is disabled

* Apply some renaming suggestions.

Fix comments that were out of sync with the pull.

* Use more consistant naming convention for sampling contexts
---
 Makefile                                     |  86 ++++---
 build.zig                                    |   5 +-
 common/CMakeLists.txt                        |   2 +
 common/common.cpp                            | 228 +++++--------------
 common/common.h                              |  56 +----
 common/sampling.cpp                          | 166 ++++++++++++++
 common/sampling.h                            | 108 +++++++++
 examples/embd-input/embd-input-lib.cpp       |  19 +-
 examples/infill/infill.cpp                   |  18 +-
 examples/main/main.cpp                       |  18 +-
 examples/parallel/parallel.cpp               |   6 +-
 examples/save-load-state/save-load-state.cpp |   5 +-
 examples/server/server.cpp                   | 100 ++++----
 examples/speculative/speculative.cpp         |  12 +-
 14 files changed, 495 insertions(+), 334 deletions(-)
 create mode 100644 common/sampling.cpp
 create mode 100644 common/sampling.h

diff --git a/Makefile b/Makefile
index 571ad3bbec1fe..705fa1effc2fa 100644
--- a/Makefile
+++ b/Makefile
@@ -178,6 +178,24 @@ else
 	MK_CPPFLAGS += -DNDEBUG
 endif
 
+ifdef LLAMA_SANITIZE_THREAD
+	MK_CFLAGS   += -fsanitize=thread -g
+	MK_CXXFLAGS += -fsanitize=thread -g
+	MK_LDFLAGS  += -fsanitize=thread -g
+endif
+
+ifdef LLAMA_SANITIZE_ADDRESS
+	MK_CFLAGS   += -fsanitize=address -fno-omit-frame-pointer -g
+	MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
+	MK_LDFLAGS  += -fsanitize=address -fno-omit-frame-pointer -g
+endif
+
+ifdef LLAMA_SANITIZE_UNDEFINED
+	MK_CFLAGS   += -fsanitize=undefined -g
+	MK_CXXFLAGS += -fsanitize=undefined -g
+	MK_LDFLAGS  += -fsanitize=undefined -g
+endif
+
 ifdef LLAMA_SERVER_VERBOSE
 	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
@@ -526,7 +544,13 @@ OBJS += ggml-alloc.o ggml-backend.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-common.o: common/common.cpp common/common.h build-info.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
+COMMON_DEPS   = $(COMMON_H_DEPS) common.o sampling.o
+
+common.o: common/common.cpp $(COMMON_H_DEPS)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 console.o: common/console.cpp common/console.h
@@ -548,19 +572,19 @@ clean:
 # Examples
 #
 
-main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 
-infill: examples/infill/infill.cpp                            build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-simple: examples/simple/simple.cpp                            build-info.h ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o common.o $(OBJS)
+batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 batched-bench: examples/batched-bench/batched-bench.cpp       build-info.h ggml.o llama.o common.o $(OBJS)
@@ -572,53 +596,53 @@ quantize: examples/quantize/quantize.cpp                      build-info.h ggml.
 quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp                   build-info.h ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp                   build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
-$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
 
-embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
+baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 ifdef LLAMA_METAL
@@ -659,40 +683,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-c.o: tests/test-c.c llama.h
diff --git a/build.zig b/build.zig
index fdc5bc084eb46..0b74cee485320 100644
--- a/build.zig
+++ b/build.zig
@@ -128,17 +128,18 @@ pub fn build(b: *std.build.Builder) !void {
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
+    const sampling = make.obj("sampling", "common/sampling.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 951aa8340c7e4..fbb0ff0952ac7 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -5,6 +5,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    sampling.h
+    sampling.cpp
     console.h
     console.cpp
     grammar-parser.h
diff --git a/common/common.cpp b/common/common.cpp
index 0f55c33a713a7..4214e63afd87a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sampling_params;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_k = std::stoi(argv[i]);
+            sparams.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_p = std::stof(argv[i]);
+            sparams.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.temp = std::stof(argv[i]);
+            sparams.temp = std::stof(argv[i]);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.tfs_z = std::stof(argv[i]);
+            sparams.tfs_z = std::stof(argv[i]);
         } else if (arg == "--typical") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.typical_p = std::stof(argv[i]);
+            sparams.typical_p = std::stof(argv[i]);
         } else if (arg == "--repeat-last-n") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_last_n = std::stoi(argv[i]);
+            sparams.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_penalty = std::stof(argv[i]);
+            sparams.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "--frequency-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.frequency_penalty = std::stof(argv[i]);
+            sparams.frequency_penalty = std::stof(argv[i]);
         } else if (arg == "--presence-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.presence_penalty = std::stof(argv[i]);
+            sparams.presence_penalty = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat = std::stoi(argv[i]);
+            sparams.mirostat = std::stoi(argv[i]);
         } else if (arg == "--mirostat-lr") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_eta = std::stof(argv[i]);
+            sparams.mirostat_eta = std::stof(argv[i]);
         } else if (arg == "--mirostat-ent") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_tau = std::stof(argv[i]);
+            sparams.mirostat_tau = std::stof(argv[i]);
         } else if (arg == "--cfg-negative-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_negative_prompt = argv[i];
+            sparams.cfg_negative_prompt = argv[i];
         } else if (arg == "--cfg-negative-prompt-file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
-                params.cfg_negative_prompt.pop_back();
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+            if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+                sparams.cfg_negative_prompt.pop_back();
             }
         } else if (arg == "--cfg-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_scale = std::stof(argv[i]);
+            sparams.cfg_scale = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -512,7 +513,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
+            sparams.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -524,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             std::string value_str;
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                 } else {
                     throw std::exception();
                 }
@@ -627,6 +628,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -659,19 +662,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
+    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
+    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
     printf("  --mirostat N          use Mirostat sampling.\n");
     printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
+    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
+    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
     printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
     printf("                        modifies the likelihood of token appearing in the completion,\n");
     printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -682,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        negative prompt to use for guidance. (default: empty)\n");
     printf("  --cfg-negative-prompt-file FNAME\n");
     printf("                        negative prompt file to use for guidance. (default: empty)\n");
-    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
     printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
     printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -690,7 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --no-penalize-nl      do not penalize newline token\n");
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -840,7 +843,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }
 
     if (params.ignore_eos) {
-        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }
 
     {
@@ -932,127 +935,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }
 
-//
-// Sampling utils
-//
-
-llama_token llama_sample_token(
-                  struct llama_context * ctx,
-                  struct llama_context * ctx_guidance,
-                  struct llama_grammar * grammar,
-               const struct gpt_params & params,
-        const std::vector<llama_token> & last_tokens,
-         std::vector<llama_token_data> & candidates,
-                                   int   idx) {
-    const int n_ctx   = llama_n_ctx(ctx);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float   top_p           = params.top_p;
-    const float   tfs_z           = params.tfs_z;
-    const float   typical_p       = params.typical_p;
-    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-    const float   repeat_penalty  = params.repeat_penalty;
-    const float   alpha_presence  = params.presence_penalty;
-    const float   alpha_frequency = params.frequency_penalty;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
-    const bool    penalize_nl     = params.penalize_nl;
-
-    llama_token id = 0;
-
-    float * logits = llama_get_logits_ith(ctx, idx);
-
-    // Apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    candidates.clear();
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
-
-    if (ctx_guidance) {
-        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
-    }
-
-    // apply penalties
-    if (!last_tokens.empty()) {
-        const float nl_logit = logits[llama_token_nl(ctx)];
-        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
-
-        llama_sample_repetition_penalty(ctx, &cur_p,
-                last_tokens.data() + last_tokens.size() - last_n_repeat,
-                last_n_repeat, repeat_penalty);
-        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
-                last_tokens.data() + last_tokens.size() - last_n_repeat,
-                last_n_repeat, alpha_frequency, alpha_presence);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    if (grammar != NULL) {
-        llama_sample_grammar(ctx, &cur_p, grammar);
-    }
-
-    if (temp <= 0) {
-        // Greedy sampling
-        id = llama_sample_token_greedy(ctx, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-        } else if (mirostat == 2) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-        } else {
-            // Temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
-            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
-            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
-            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
-            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
-            llama_sample_temp(ctx, &cur_p, temp);
-
-            {
-                const int n_top = 10;
-                LOG("top %d candidates:\n", n_top);
-
-                for (int i = 0; i < n_top; i++) {
-                    const llama_token id = cur_p.data[i].id;
-                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
-                }
-            }
-
-            id = llama_sample_token(ctx, &cur_p);
-
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
-        }
-    }
-    // printf("`%d`", candidates_p.size);
-
-    if (grammar != NULL) {
-        llama_grammar_accept_token(ctx, grammar, id);
-    }
-
-    return id;
-}
-
 //
 // YAML utils
 //
@@ -1204,6 +1086,8 @@ std::string get_sortable_timestamp() {
 
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
     fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@@ -1250,21 +1134,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 
     fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
+    dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
     dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
 
-    const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
-    const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
+    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
     fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
 
     dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@@ -1277,7 +1161,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : params.logit_bias) {
+    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
         if (ignore_eos && lb.first == logit_bias_eos->first) {
             continue;
         }
@@ -1301,30 +1185,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
-    fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
+    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
     dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
     fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
     dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
     fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
 
     fprintf(stream, "reverse_prompt:\n");
     for (std::string ap : params.antiprompt) {
@@ -1342,15 +1226,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
+    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
 
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
-    fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
+    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
+    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
diff --git a/common/common.h b/common/common.h
index c802152791797..fa115536b64a0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -4,6 +4,8 @@
 
 #include "llama.h"
 
+#include "sampling.h"
+
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
@@ -49,31 +51,12 @@ struct gpt_params {
     int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
-    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
     float   rope_freq_base                  = 0.0f; // RoPE base frequency
     float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor
 
-    // sampling parameters
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
-    float   repeat_penalty    = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   frequency_penalty = 0.00f; // 0.0 = disabled
-    float   presence_penalty  = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt;       // string to help guidance
-    float       cfg_scale         = 1.f;   // How strong is guidance
+    // // sampling parameters
+    struct llama_sampling_params sampling_params;
 
     std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft       = "";                              // draft model for speculative decoding
@@ -115,7 +98,6 @@ struct gpt_params {
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos        = false; // ignore generated EOS tokens
     bool instruct          = false; // instruction mode (used for Alpaca models)
-    bool penalize_nl       = true;  // consider newlines as a repeatable token
     bool logits_all        = false; // return logits for all tokens in the batch
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
@@ -180,36 +162,6 @@ std::string llama_detokenize_bpe(
                          llama_context * ctx,
         const std::vector<llama_token> & tokens);
 
-//
-// Sampling utils
-//
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-//
-// required:
-//  - ctx:    context to use for sampling
-//  - params: sampling parameters
-//
-// optional:
-//  - ctx_guidance:  context to use for classifier-free guidance, ignore if NULL
-//  - grammar:       grammar to use for sampling, ignore if NULL
-//  - last_tokens:   needed for repetition penalty, ignore if empty
-//  - idx:           sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
-//
-llama_token llama_sample_token(
-                  struct llama_context * ctx,
-                  struct llama_context * ctx_guidance,
-                  struct llama_grammar * grammar,
-               const struct gpt_params & params,
-        const std::vector<llama_token> & last_tokens,
-         std::vector<llama_token_data> & candidates,
-                                   int   idx = 0);
-
 //
 // YAML utils
 //
diff --git a/common/sampling.cpp b/common/sampling.cpp
new file mode 100644
index 0000000000000..8ce4194593ca7
--- /dev/null
+++ b/common/sampling.cpp
@@ -0,0 +1,166 @@
+#include "sampling.h"
+
+llama_sampling_context::~llama_sampling_context() {
+    for (auto & it : sequence_contexts) {
+        if (it.second.grammar != NULL) {
+            llama_grammar_free(it.second.grammar);
+            it.second.grammar = NULL;
+        }
+    }
+}
+
+llama_sampling_context llama_sampling_context_init(
+        const struct gpt_params & params,
+                  llama_grammar * grammar) {
+  llama_sampling_context result;
+
+  result.params = params.sampling_params;
+  result.grammar = grammar;
+  return result;
+}
+
+// Note: Creates the context if it doesn't exist, so this always return something.
+llama_sampler_sequence_context & llama_sampling_get_sequence_context(
+              llama_sampling_context & ctx_sampling,
+        const llama_seq_id             seq) {
+    const auto it = ctx_sampling.sequence_contexts.find(seq);
+    if (it != ctx_sampling.sequence_contexts.end()) {
+        return it->second;
+    }
+    llama_sampler_sequence_context new_ctx = {
+        2.0f * ctx_sampling.params.mirostat_tau,
+        ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
+    };
+    return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
+}
+
+bool llama_sampling_context_reset(
+              llama_sampling_context & ctx_sampling,
+        const llama_seq_id             seq) {
+    const auto it = ctx_sampling.sequence_contexts.find(seq);
+    if (it == ctx_sampling.sequence_contexts.end()) return false;
+    if (it->second.grammar != NULL) {
+        llama_grammar_free(it->second.grammar);
+        it->second.grammar = NULL;
+    }
+    ctx_sampling.sequence_contexts.erase(it);
+    return true;
+}
+
+llama_token llama_sampling_sample(
+                  struct llama_context * ctx,
+                  struct llama_context * ctx_guidance,
+                  struct llama_sampling_context & ctx_sampling,
+        const std::vector<llama_token> & last_tokens,
+         std::vector<llama_token_data> & candidates,
+        const                      int   idx,
+                          llama_seq_id   seq) {
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    const llama_sampling_params & params = ctx_sampling.params;
+    const float   temp            = params.temp;
+    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float   top_p           = params.top_p;
+    const float   tfs_z           = params.tfs_z;
+    const float   typical_p       = params.typical_p;
+    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const float   repeat_penalty  = params.repeat_penalty;
+    const float   alpha_presence  = params.presence_penalty;
+    const float   alpha_frequency = params.frequency_penalty;
+    const int     mirostat        = params.mirostat;
+    const float   mirostat_tau    = params.mirostat_tau;
+    const float   mirostat_eta    = params.mirostat_eta;
+    const bool    penalize_nl     = params.penalize_nl;
+
+    llama_token id = 0;
+
+    float * logits = llama_get_logits_ith(ctx, idx);
+
+    // Apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    candidates.clear();
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+    if (ctx_guidance) {
+        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+    }
+
+    // apply penalties
+    if (!last_tokens.empty()) {
+        const float nl_logit = logits[llama_token_nl(ctx)];
+        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+        llama_sample_repetition_penalty(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
+
+    if (ctx_seq.grammar != NULL) {
+        llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
+    }
+
+    if (temp <= 0) {
+        // Greedy sampling
+        id = llama_sample_token_greedy(ctx, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            const int mirostat_m = 100;
+            llama_sample_temp(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
+        } else if (mirostat == 2) {
+            llama_sample_temp(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
+        } else {
+            // Temperature sampling
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
+            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
+            llama_sample_temp(ctx, &cur_p, temp);
+
+            {
+                const int n_top = 10;
+                LOG("top %d candidates:\n", n_top);
+
+                for (int i = 0; i < n_top; i++) {
+                    const llama_token id = cur_p.data[i].id;
+                    (void)id; // To avoid a warning that id is unused when logging is disabled.
+                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
+                }
+            }
+
+            id = llama_sample_token(ctx, &cur_p);
+
+            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+        }
+    }
+
+    if (ctx_seq.grammar != NULL) {
+        llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
+    }
+
+    return id;
+}
diff --git a/common/sampling.h b/common/sampling.h
new file mode 100644
index 0000000000000..0aab5d03c2f61
--- /dev/null
+++ b/common/sampling.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+// sampling parameters
+typedef struct llama_sampling_params {
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typical_p         = 1.00f; // 1.0 = disabled
+    float   temp              = 0.80f; // 1.0 = disabled
+    float   repeat_penalty    = 1.10f; // 1.0 = disabled
+    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   frequency_penalty = 0.00f; // 0.0 = disabled
+    float   presence_penalty  = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+
+    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+
+    // Classifier-Free Guidance
+    // https://arxiv.org/abs/2306.17806
+    std::string cfg_negative_prompt;   // string to help guidance
+    float       cfg_scale     = 1.f;   // How strong is guidance
+
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+} llama_sampling_params;
+
+// per-sequence sampler context
+typedef struct llama_sampler_sequence_context {
+    float mirostat_mu; // mirostat sampler state
+    llama_grammar * grammar;
+} llama_sampler_sequence_context;
+
+// general sampler context
+typedef struct llama_sampling_context {
+    ~llama_sampling_context();
+
+    // parameters that will be used for sampling and when creating
+    // new llama_sampler_sequence_context instances
+    llama_sampling_params params;
+
+    // map of sequence ids to sampler contexts
+    std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
+
+    // when non-NULL, new instances of llama_sampler_sequence_context
+    // will get a copy of the grammar here
+    // note: only the pointer is stored here, it is not a copy of
+    //       the grammar and shouldn't be freed
+    llama_grammar * grammar;
+} llama_sampling_context;
+
+#include "common.h"
+
+// Create a new sampling context instance.
+llama_sampling_context llama_sampling_context_init(
+        const struct gpt_params & params,
+                  llama_grammar * grammar = NULL);
+
+// Fetches the sampler context for the specified sequence id (defaults to 0).
+// If the context for that sequence id doesn't already exist, it will be created with
+// default values based on the parameters in the ctx_sampling argument.
+llama_sampler_sequence_context & llama_sampling_get_sequence_context(
+              llama_sampling_context & ctx_sampling,
+        const llama_seq_id             seq = 0);
+
+// Reset the sampler context for the supplied sequence id (defaults to 0).
+// This is necessary to reuse a sequence id or free memory used by sequences
+// that are no longer required.
+bool llama_sampling_context_reset(
+              llama_sampling_context & ctx_sampling,
+        const llama_seq_id             seq = 0);
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+// Note: When using multiple sequences, it is the caller's responsibility to call
+//       llama_sampling_context_reset when a sequence ends
+//
+// required:
+//  - ctx:          context to use for sampling
+//  - ctx_sampling: sampling-specific context
+//
+// optional:
+//  - ctx_guidance:  context to use for classifier-free guidance, ignore if NULL
+//  - last_tokens:   needed for repetition penalty, ignore if empty
+//  - idx:           sample from llama_get_logits_ith(ctx, idx)
+//  - seq:           sequence id to associate sampler state with
+//
+// returns:
+//  - token:      sampled token
+//  - candidates: vector of candidate tokens
+//
+llama_token llama_sampling_sample(
+                  struct llama_context * ctx,
+                  struct llama_context * ctx_guidance,
+                  struct llama_sampling_context & ctx_sampling,
+        const std::vector<llama_token> & last_tokens,
+         std::vector<llama_token_data> & candidates,
+        const                      int   idx = 0,
+                          llama_seq_id   seq = 0);
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 99e6bdad5ac45..87a5a1c26f88b 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
 llama_token sampling_id(struct MyModel* mymodel) {
     llama_context* ctx = mymodel->ctx;
     gpt_params params = mymodel->params;
+    llama_sampling_params & sparams = params.sampling_params;
     // int n_ctx = llama_n_ctx(ctx);
 
     // out of user input, sample next token
-    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
-    const float   top_p           = params.top_p;
-    const float   tfs_z           = params.tfs_z;
-    const float   typical_p       = params.typical_p;
+    const float   temp            = sparams.temp;
+    const int32_t top_k           = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
+    const float   top_p           = sparams.top_p;
+    const float   tfs_z           = sparams.tfs_z;
+    const float   typical_p       = sparams.typical_p;
     // const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
     // const float   repeat_penalty  = params.repeat_penalty;
     // const float   alpha_presence  = params.presence_penalty;
     // const float   alpha_frequency = params.frequency_penalty;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
+    const int     mirostat        = sparams.mirostat;
+    const float   mirostat_tau    = sparams.mirostat_tau;
+    const float   mirostat_eta    = sparams.mirostat_eta;
     // const bool    penalize_nl     = params.penalize_nl;
 
     llama_token id = 0;
@@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
         auto n_vocab = llama_n_vocab(llama_get_model(ctx));
 
         // Apply params.logit_bias map
-        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
             logits[it->first] += it->second;
         }
 
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index d994de5e850c3..187623f5d8424 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -104,6 +104,7 @@ static void sigint_handler(int signo) {
 
 int main(int argc, char ** argv) {
     gpt_params params;
+    llama_sampling_params & sparams = params.sampling_params;
     g_params = &params;
 
     if (!gpt_params_parse(argc, argv, params)) {
@@ -206,7 +207,7 @@ int main(int argc, char ** argv) {
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
+    if (sparams.cfg_scale > 1.f) {
         struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
         ctx_guidance = llama_new_context_with_model(model, lparams);
     }
@@ -269,9 +270,9 @@ int main(int argc, char ** argv) {
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
 
         std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -312,7 +313,7 @@ int main(int argc, char ** argv) {
 
         if (ctx_guidance) {
             LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
             LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -358,7 +359,7 @@ int main(int argc, char ** argv) {
         }
     }
     LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 
@@ -376,8 +377,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
 
         {
-            auto it = params.logit_bias.find(llama_token_eos(ctx));
-            if (it != params.logit_bias.end() && it->second == -INFINITY) {
+            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
+            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                 LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
         }
@@ -434,6 +435,7 @@ int main(int argc, char ** argv) {
 
     const int n_vocab = llama_n_vocab(model);
 
+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
 
@@ -552,7 +554,7 @@ int main(int argc, char ** argv) {
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
 
-            const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
 
             last_tokens.erase(last_tokens.begin());
             last_tokens.push_back(id);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 775a5a201e5b8..b39a67d979c88 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -109,6 +109,7 @@ int main(int argc, char ** argv) {
     if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
+    llama_sampling_params & sparams = params.sampling_params;
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
@@ -179,7 +180,7 @@ int main(int argc, char ** argv) {
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
+    if (sparams.cfg_scale > 1.f) {
         struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
         ctx_guidance = llama_new_context_with_model(model, lparams);
     }
@@ -257,9 +258,9 @@ int main(int argc, char ** argv) {
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
 
         std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -343,7 +344,7 @@ int main(int argc, char ** argv) {
 
         if (ctx_guidance) {
             LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
             LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -395,7 +396,7 @@ int main(int argc, char ** argv) {
         }
     }
     LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 
@@ -413,8 +414,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
 
         {
-            auto it = params.logit_bias.find(llama_token_eos(ctx));
-            if (it != params.logit_bias.end() && it->second == -INFINITY) {
+            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
+            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                 LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
         }
@@ -469,6 +470,7 @@ int main(int argc, char ** argv) {
 
     const int n_vocab = llama_n_vocab(model);
 
+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
 
@@ -625,7 +627,7 @@ int main(int argc, char ** argv) {
                 LOG("saved session to %s\n", path_session.c_str());
             }
 
-            const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
 
             last_tokens.erase(last_tokens.begin());
             last_tokens.push_back(id);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 04f1e45b955dd..63ddcd8ed8857 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
+
     // load the prompts from an external file if there are any
     if (params.prompt.empty()) {
         printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@@ -339,7 +341,7 @@ int main(int argc, char ** argv) {
                 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                 //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
-                const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
+                const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
 
                 if (client.n_decoded == 1) {
                     // start measuring generation time after the first token to make sure all concurrent clients
@@ -384,7 +386,7 @@ int main(int argc, char ** argv) {
 
                     n_total_prompt += client.n_prompt;
                     n_total_gen    += client.n_decoded;
-
+                    llama_sampling_context_reset(ctx_sampling, client.seq_id);
                     client.seq_id = -1;
                 }
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index acc6dbdfd07d0..f9e3c98a38a40 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -8,9 +8,10 @@
 
 int main(int argc, char ** argv) {
     gpt_params params;
+    llama_sampling_params & sparams = params.sampling_params;
     params.seed = 42;
     params.n_threads = 4;
-    params.repeat_last_n = 64;
+    sparams.repeat_last_n = 64;
     params.prompt = "The quick brown fox";
 
     if (!gpt_params_parse(argc, argv, params)) {
@@ -24,7 +25,7 @@ int main(int argc, char ** argv) {
     }
 
     auto n_past = 0;
-    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
+    auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
 
     // init
     llama_model * model;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8c5318c650ae8..58af78de96cb5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -200,6 +200,7 @@ struct llama_server_context
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     gpt_params params;
+    llama_sampling_context ctx_sampling;
     int n_ctx;
 
     grammar_parser::parse_state parsed_grammar;
@@ -254,6 +255,7 @@ struct llama_server_context
         if (grammar != nullptr) {
             llama_grammar_free(grammar);
             grammar = nullptr;
+            ctx_sampling = llama_sampling_context_init(params, NULL);
         }
     }
 
@@ -329,8 +331,8 @@ struct llama_server_context
             grammar_parser::print_grammar(stderr, parsed_grammar);
 
             {
-                auto it = params.logit_bias.find(llama_token_eos(ctx));
-                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
                     LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
                 }
             }
@@ -339,6 +341,7 @@ struct llama_server_context
             grammar = llama_grammar_init(
                 grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
         }
+        ctx_sampling = llama_sampling_context_init(params, grammar);
         return true;
     }
 
@@ -550,12 +553,12 @@ struct llama_server_context
             std::vector<llama_token_data> candidates;
             candidates.reserve(llama_n_vocab(model));
 
-            result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
+            result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
 
             llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-            const int32_t n_probs = params.n_probs;
-            if (params.temp <= 0 && n_probs > 0)
+            const int32_t n_probs = params.sampling_params.n_probs;
+            if (params.sampling_params.temp <= 0 && n_probs > 0)
             {
                 // For llama_sample_token_greedy we need to sort candidates
                 llama_sample_softmax(ctx, &candidates_p);
@@ -630,7 +633,7 @@ struct llama_server_context
         const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
         generated_text += token_text;
 
-        if (params.n_probs > 0)
+        if (params.sampling_params.n_probs > 0)
         {
             generated_token_probs.push_back(token_with_probs);
         }
@@ -1018,34 +1021,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
-    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
+    const auto & sparams = llama.params.sampling_params;
+    const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
+    const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
                             eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 
     return json{
         {"n_ctx", llama.n_ctx},
         {"model", llama.params.model_alias},
         {"seed", llama.params.seed},
-        {"temp", llama.params.temp},
-        {"top_k", llama.params.top_k},
-        {"top_p", llama.params.top_p},
-        {"tfs_z", llama.params.tfs_z},
-        {"typical_p", llama.params.typical_p},
-        {"repeat_last_n", llama.params.repeat_last_n},
-        {"repeat_penalty", llama.params.repeat_penalty},
-        {"presence_penalty", llama.params.presence_penalty},
-        {"frequency_penalty", llama.params.frequency_penalty},
-        {"mirostat", llama.params.mirostat},
-        {"mirostat_tau", llama.params.mirostat_tau},
-        {"mirostat_eta", llama.params.mirostat_eta},
-        {"penalize_nl", llama.params.penalize_nl},
+        {"temp", sparams.temp},
+        {"top_k", sparams.top_k},
+        {"top_p", sparams.top_p},
+        {"tfs_z", sparams.tfs_z},
+        {"typical_p", sparams.typical_p},
+        {"repeat_last_n", sparams.repeat_last_n},
+        {"repeat_penalty", sparams.repeat_penalty},
+        {"presence_penalty", sparams.presence_penalty},
+        {"frequency_penalty", sparams.frequency_penalty},
+        {"mirostat", sparams.mirostat},
+        {"mirostat_tau", sparams.mirostat_tau},
+        {"mirostat_eta", sparams.mirostat_eta},
+        {"penalize_nl", sparams.penalize_nl},
         {"stop", llama.params.antiprompt},
         {"n_predict", llama.params.n_predict},
         {"n_keep", llama.params.n_keep},
         {"ignore_eos", ignore_eos},
         {"stream", llama.stream},
-        {"logit_bias", llama.params.logit_bias},
-        {"n_probs", llama.params.n_probs},
+        {"logit_bias", sparams.logit_bias},
+        {"n_probs", sparams.n_probs},
         {"grammar", llama.params.grammar},
     };
 }
@@ -1094,7 +1098,7 @@ static json format_final_response(llama_server_context &llama, const std::string
         {"timings", format_timings(llama)},
     };
 
-    if (llama.params.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
     {
         res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
     }
@@ -1110,7 +1114,7 @@ static json format_partial_response(
         {"stop", false},
     };
 
-    if (llama.params.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
     {
         res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
     }
@@ -1142,26 +1146,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
     gpt_params default_params;
+    const auto & default_sparams = default_params.sampling_params;
+    auto & sparams = llama.params.sampling_params;
 
     llama.stream = json_value(body, "stream", false);
     llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
-    llama.params.top_k = json_value(body, "top_k", default_params.top_k);
-    llama.params.top_p = json_value(body, "top_p", default_params.top_p);
-    llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
-    llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
-    llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
-    llama.params.temp = json_value(body, "temperature", default_params.temp);
-    llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
-    llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
-    llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
-    llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
-    llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
-    llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
-    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+    sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
+    sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
+    sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
+    sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
+    sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
+    sparams.temp = json_value(body, "temperature", default_sparams.temp);
+    sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
+    sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
+    sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
+    sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
+    sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
     llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
     llama.params.seed = json_value(body, "seed", default_params.seed);
     llama.params.grammar = json_value(body, "grammar", default_params.grammar);
-    llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
+    sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
 
     if (body.count("prompt") != 0)
     {
@@ -1172,10 +1178,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
         llama.prompt = "";
     }
 
-    llama.params.logit_bias.clear();
+    sparams.logit_bias.clear();
     if (json_value(body, "ignore_eos", false))
     {
-        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
+        sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
     }
 
     const auto &logit_bias = body.find("logit_bias");
@@ -1191,11 +1197,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
                 {
                     if (el[1].is_number())
                     {
-                        llama.params.logit_bias[tok] = el[1].get<float>();
+                        sparams.logit_bias[tok] = el[1].get<float>();
                     }
                     else if (el[1].is_boolean() && !el[1].get<bool>())
                     {
-                        llama.params.logit_bias[tok] = -INFINITY;
+                        sparams.logit_bias[tok] = -INFINITY;
                     }
                 }
             }
@@ -1215,6 +1221,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
         }
     }
 
+    llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
+
     LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
 }
 
@@ -1423,7 +1431,7 @@ int main(int argc, char **argv)
             }
 
             auto probs = llama.generated_token_probs;
-            if (llama.params.n_probs > 0 && llama.stopped_word) {
+            if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
                 const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
                 probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
             }
@@ -1475,7 +1483,7 @@ int main(int argc, char **argv)
 
                         std::vector<completion_token_output> probs_output = {};
 
-                        if (llama.params.n_probs > 0) {
+                        if (llama.params.sampling_params.n_probs > 0) {
                             const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                             size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                             size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -1596,7 +1604,7 @@ int main(int argc, char **argv)
 
                     std::vector<completion_token_output> probs_output = {};
 
-                    if (llama.params.n_probs > 0) {
+                    if (llama.params.sampling_params.n_probs > 0) {
                         const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                         size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                         size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 75a2e5e22d046..018dbf9a205b9 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
         grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
     }
 
+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
+
     const auto t_dec_start = ggml_time_us();
 
     while (true) {
@@ -134,7 +136,7 @@ int main(int argc, char ** argv) {
 
         while (true) {
             // sample from the target model
-            llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+            llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
 
             // remember which tokens were sampled - used for repetition penalties during sampling
             last_tokens.erase(last_tokens.begin());
@@ -211,7 +213,13 @@ int main(int argc, char ** argv) {
             if (grammar_dft) {
                 llama_grammar_free(grammar_dft);
             }
-            grammar_dft = llama_grammar_copy(grammar_tgt);
+            // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
+            //       that will need to change.
+            auto it = ctx_sampling.sequence_contexts.find(0);
+            GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
+            // This is necessary because each sequence id in sequence_contexts
+            // uses a copy of the original grammar.
+            grammar_dft = llama_grammar_copy(it->second.grammar);
 
             LOG("copied target grammar to draft grammar\n");
         }

From a8bdd65525ae86dea905e9866ad369b53e30ac14 Mon Sep 17 00:00:00 2001
From: Michael Coppola <m18coppola@gmail.com>
Date: Wed, 11 Oct 2023 15:42:22 -0400
Subject: [PATCH 25/26] server : add parameter -tb N, --threads-batch N (#3584)

Co-authored-by: Michael Coppola <info@michaeljcoppola.com>
---
 examples/server/server.cpp | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 58af78de96cb5..d992feeef7026 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -714,15 +714,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("usage: %s [options]\n", argv0);
     printf("\n");
     printf("options:\n");
-    printf("  -h, --help            show this help message and exit\n");
-    printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n");
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n");
-    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("  -h, --help                show this help message and exit\n");
+    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    printf("  -t N,  --threads N        number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
+    printf("  -c N,  --ctx-size N       size of the prompt context (default: %d)\n", params.n_ctx);
+    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
+    printf("  --rope-freq-scale N       RoPE frequency scaling factor (default: loaded from model)\n");
+    printf("  -b N,  --batch-size N     batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
@@ -867,6 +868,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.n_threads = std::stoi(argv[i]);
         }
+        else if (arg == "--threads-batch" || arg == "-tb")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_batch = std::stoi(argv[i]);
+        }
         else if (arg == "-b" || arg == "--batch-size")
         {
             if (++i >= argc)

From b8fe4b5cc9cb237ca98e5bc51b5d189e3c446d13 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 11 Oct 2023 23:55:08 +0300
Subject: [PATCH 26/26] main : fix session loading bug (#3400)

---
 examples/main/main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index b39a67d979c88..55f73356fb89a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -297,6 +297,9 @@ int main(int argc, char ** argv) {
             LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
                 __func__, n_matching_session_tokens, embd_inp.size());
         }
+
+        // remove any "future" tokens that we might have inherited from the previous session
+        llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
     }
 
     LOGLN(
@@ -545,9 +548,6 @@ int main(int argc, char ** argv) {
                 if (i > 0) {
                     embd.erase(embd.begin(), embd.begin() + i);
                 }
-
-                // remove any "future" tokens that we might have inherited from the session from the KV cache
-                llama_kv_cache_tokens_rm(ctx, n_past, -1);
             }
 
             // evaluate tokens in batches