From 67c5e14d069fba61a424f6d782de2d49bf2a8722 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Sat, 6 Jul 2024 02:12:53 +0200
Subject: [PATCH 01/33] lora: load to device buft

---
 common/common.cpp |  10 +-
 include/llama.h   |  13 +-
 src/llama.cpp     | 411 ++++++++++++++++++----------------------
 3 files changed, 166 insertions(+), 268 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c548bcb2857a8..d3eec6aa783b3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2063,14 +2063,8 @@ std::tuple llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
diff --git a/include/llama.h b/include/llama.h
index 865ace9944d02..077d902837c49 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -406,6 +406,9 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
+    // lora adapter
+    struct llama_lora_adapter;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -510,13 +513,9 @@ extern "C" {
     // the layers modified by the adapter. Can be NULL to use the current loaded model.
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-            const char * path_lora,
-            float scale,
-            const char * path_base_model,
-            int32_t n_threads);
+    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+            struct llama_context * ctx,
+            const char * path_lora);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
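
The header change above replaces llama_model_apply_lora_from_file() with an adapter handle returned by llama_lora_adapter_init(). For orientation, a minimal caller-side sketch of the API as it stands at this point in the series (not part of the patch; it simply mirrors the common.cpp hunk above, assumes lctx/model were created the usual way, and uses a placeholder adapter path):

    // sketch only: load a LoRA adapter through the context (patch 01 signature)
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(lctx, "my-adapter.gguf");
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter\n"); // needs <cstdio>
        llama_free(lctx);
        llama_free_model(model);
    }

Later patches in the series change this signature (the adapter becomes owned by the model and is attached to a context with a separate call), as shown further below.
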
diff --git a/src/llama.cpp b/src/llama.cpp index b770ca5bc33fc..ec89b2778ea08 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,6 +2547,29 @@ struct llama_control_vector { } }; +struct lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + lora_weight() {} + lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + // map tensor name to lora_a_b + std::map ab_map; + std::vector ctxs; + std::vector bufs; + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2704,6 +2727,10 @@ struct llama_context { } ggml_backend_buffer_free(buf_output); + + for (auto adapter : lora_adapters) { + delete adapter; + } } llama_cparams cparams; @@ -2795,6 +2822,9 @@ struct llama_context { // control vectors struct llama_control_vector cvec; + + // lora adapters + std::vector lora_adapters; }; static size_t llama_get_device_count(const llama_model & model) { @@ -18243,281 +18273,149 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_apply_lora_from_file_internal( - const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads -) { +static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { + static const int n_inp_tensors = 5; // see llama_model + static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - const int64_t t_start_lora_us = ggml_time_us(); - - llama_file fin(path_lora, "rb"); - - // verify magic and version - { - uint32_t magic = fin.read_u32(); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); - return 1; - } - - uint32_t format_version = fin.read_u32(); - if (format_version != 1) { - LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); - return 1; - } - } - - int32_t lora_r = fin.read_u32(); - int32_t lora_alpha = fin.read_u32(); - float scaling = scale * (float)lora_alpha / (float)lora_r; + // TODO: check lora base model arch - LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - - // load base model - std::unique_ptr ml; - if (path_base_model) { - LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); - ml->init_mappings(/*prefetch*/ false); // no prefetching - } - - struct tensor_meta { - std::string name; - ggml_type type; - int32_t ne[2]; - size_t offset; + ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, }; - std::map tensor_meta_map; - - // load all tensor meta - while (true) { - if (fin.tell() == fin.size) { - // eof - break; - } - - int32_t n_dims; - int32_t name_len; - int32_t ftype; - - fin.read_raw(&n_dims, sizeof(n_dims)); - fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); - - if (n_dims != 1 && n_dims != 2) { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) 
{ - fin.read_raw(&ne[i], sizeof(ne[i])); - } - - std::string name; - { - GGML_ASSERT(name_len < GGML_MAX_NAME); - char buf[GGML_MAX_NAME]; - fin.read_raw(buf, name_len); - name = std::string(buf, name_len); - } - - // check for lora suffix - std::string lora_suffix; - if (name.length() > 6) { - lora_suffix = name.substr(name.length() - 6); - } - if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { - LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; - } + struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); + if (!ctx_gguf) { + LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); + return -1; + } - // tensor type - ggml_type wtype; - switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; - default: - { - LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", - __func__, ftype); - return 1; - } + // calculate n_tensors_per_layer + int n_tensors_per_layer = 0; + { + int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + int il = -1; + sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); + if (il == 0) n_tensors_per_layer++; } - - // data offset - size_t offset = fin.tell(); - offset = (offset + 31) & -32; - - // skip tensor data - fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - - tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); } + printf("n_tensors_per_layer %d\n", n_tensors_per_layer); - bool warned = false; - int n_tensors = 0; - - // apply - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); - return 1; + // count layer buffer types + std::map buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; } - ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); - - std::vector> read_buf; - for (const auto & it : model.tensors_by_name) { - const std::string & base_name = it.first; - ggml_tensor * model_t = it.second; - - if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || - tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { - continue; - } - - tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); - tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - ggml_init_params lora_init_params = { - /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), - /* .mem_buffer */ nullptr, - /* .no_alloc */ true, + // allocate contexts + std::map ctx_map; + { + auto new_ggml_ctx = [](size_t n_tensors) { + struct ggml_init_params params = { + /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + return ggml_init(params); }; - ggml_context * lora_ctx = ggml_init(lora_init_params); - if (lora_ctx == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); - ggml_backend_free(backend_cpu); - return 1; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + printf("buf %p layers %d\n", it.first, it.second); + ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer); } + //ctx_map[model.buft_input.buft] = new_ggml_ctx(2*n_inp_tensors); + //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors); + } - // create tensors - ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], 
metaA.ne[1]); - ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); - ggml_set_name(loraA, metaA.name.c_str()); - ggml_set_name(loraB, metaB.name.c_str()); - - ggml_tensor * base_t; - if (ml) { - if (!ml->get_tensor_meta(base_name.c_str())) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; + // bundle lora_a and lora_b into pairs + std::map ab_map; + auto str_endswith = [](const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; + }; + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name(cur->name); + if (str_endswith(name, ".lora_a")) { + replace_all(name, ".lora_a", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = lora_weight(cur, nullptr); + } else { + ab_map[name].a = cur; + } + } else if (str_endswith(name, ".lora_b")) { + replace_all(name, ".lora_b", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = lora_weight(nullptr, cur); + } else { + ab_map[name].b = cur; } - base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); - } else { - base_t = ggml_dup_tensor(lora_ctx, model_t); - } - ggml_set_name(base_t, base_name.c_str()); - - // allocate in backend buffer - ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (lora_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); - return 1; } + } - // load tensor data - auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { - read_buf.resize(ggml_nbytes(tensor)); - fin.seek(tensor_meta.offset, SEEK_SET); - fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); - }; - load_tensor(metaA, loraA); - load_tensor(metaB, loraB); - - // load base model tensor data - if (ml) { - ml->load_data_for(base_t); + // add tensors + for (auto & it : ab_map) { + std::string name = it.first; + lora_weight & w = it.second; + GGML_ASSERT(w.a != nullptr); + GGML_ASSERT(w.b != nullptr); + int il = -1; + sscanf(name.c_str(), "blk.%d.", &il); + if (il >= 0) { + printf("%s %p %p\n", name.c_str(), w.a, w.b); + struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft); + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); } else { - ggml_backend_tensor_copy(model_t, base_t); - } - - if (ggml_is_quantized(base_t->type) && !warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; + // TODO: process output & token_embd tensors } + } - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; - } - - auto build_lora_graph = [&]() { - // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, 
loraA, loraB); - ggml_set_name(BA, "BA"); - - if (scaling != 1.0f) { - BA = ggml_scale(lora_ctx, BA, scaling); - ggml_set_name(BA, "BA_scaled"); - } - - ggml_tensor * r; - r = ggml_add_inplace(lora_ctx, base_t, BA); - ggml_set_name(r, "r_add"); - - if (base_t->type != model_t->type) { - // convert the result to the model type - r = ggml_cast(lora_ctx, r, model_t->type); - ggml_set_name(r, "r_cast"); + // allocate tensors / buffers and zero + { + adapter.ctxs.reserve(ctx_map.size()); + adapter.bufs.reserve(ctx_map.size()); + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); + return -1; } - - return r; - }; - - ggml_cgraph * gf = ggml_new_graph(lora_ctx); - ggml_tensor * r = build_lora_graph(); - ggml_build_forward_expand(gf, r); - - ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (graph_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; + ggml_backend_buffer_clear(buf, 0); + adapter.ctxs.push_back(ctx); + adapter.bufs.push_back(buf); } + } - ggml_backend_graph_compute(backend_cpu, gf); - - ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); - -#if 0 - // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU - //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); - - // sched compute - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_init_measure(sched, gf); - - // create the graph again, since the previous one was destroyed by the measure - ggml_graph_clear(gf); - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_graph_compute(sched, gf); - ggml_backend_sched_free(sched); -#endif - - ggml_backend_buffer_free(lora_buf); - ggml_backend_buffer_free(graph_buf); - ggml_free(lora_ctx); - - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); + // set tensor data + { + llama_file gguf_file(path_lora, "rb"); + std::vector read_buf; + auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { + size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); + size_t size = ggml_nbytes(orig); + if (read_buf.size() < size) { + read_buf.resize(size); + } + gguf_file.read_raw(read_buf.data(), size); + printf("%s: %s size=%ld\n", __func__, orig->name, size); + return ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + }; + for (auto & it : adapter.ab_map) { + auto orig = ab_map[it.first]; + auto dev = it.second; + set_tensor(orig.a, dev.a); + set_tensor(orig.b, dev.b); } } - ggml_backend_free(backend_cpu); - - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; - LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); - + // free ctx for reading gguf + ggml_free(ctx); return 0; } @@ -19298,12 +19196,19 @@ uint32_t llama_model_quantize( } } -int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora) { try { - return 
llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); + struct llama_lora_adapter * adapter = new llama_lora_adapter; + int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); + if (res == 0) { + ctx->lora_adapters.push_back(adapter); + return adapter; + } else { + return nullptr; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return 1; + return nullptr; } } From e9d7b6c05f928665cb9779629816128b8016418d Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 12:07:29 +0200 Subject: [PATCH 02/33] add patch tensor function --- src/llama.cpp | 211 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 193 insertions(+), 18 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ec89b2778ea08..d97eb3bb2fc63 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2702,6 +2702,10 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; + // used by lora, to save model's original tensors + std::vector orig_tensors; + std::vector orig_layers; + ~llama_model() { for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); @@ -13491,6 +13495,10 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { return result; } +// forward declaration +static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build); +static int32_t llama_lora_restore_tensors(struct llama_context & lctx); + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -13534,6 +13542,11 @@ static struct ggml_cgraph * llama_build_graph( llm.init(); + if (!lctx.lora_adapters.empty()) { + llama_lora_restore_tensors(lctx); + llama_lora_patch_tensors(lctx, llm.ctx0); + } + switch (model.arch) { case LLM_ARCH_LLAMA: { @@ -18304,10 +18317,12 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types - std::map buft_layer_count; + std::map buft_tensor_count; for (int64_t i = 0; i < model.hparams.n_layer; i++) { - buft_layer_count[model.buft_layer[i].buft]++; + buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer; } + buft_tensor_count[model.buft_input.buft] += n_inp_tensors; + buft_tensor_count[model.buft_output.buft] += n_out_tensors; // allocate contexts std::map ctx_map; @@ -18320,13 +18335,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co }; return ggml_init(params); }; - for (auto & it : buft_layer_count) { - int n_layers = it.second; - printf("buf %p layers %d\n", it.first, it.second); - ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer); + for (auto & it : buft_tensor_count) { + int n_tensors = it.second; + // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second); + ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors } - //ctx_map[model.buft_input.buft] = new_ggml_ctx(2*n_inp_tensors); - //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors); } // bundle lora_a and lora_b into pairs @@ -18356,22 +18369,29 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co // add tensors for (auto & it : ab_map) { std::string name = it.first; + const char * cname = name.c_str(); lora_weight & w = it.second; GGML_ASSERT(w.a != nullptr); GGML_ASSERT(w.b != nullptr); int il = -1; - sscanf(name.c_str(), "blk.%d.", &il); + sscanf(cname, "blk.%d.", &il); + struct 
ggml_context * dev_ctx; // device ctx if (il >= 0) { - printf("%s %p %p\n", name.c_str(), w.a, w.b); - struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft); - struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); - struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); - ggml_set_name(tensor_a, w.a->name); - ggml_set_name(tensor_b, w.b->name); - adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); + dev_ctx = ctx_map.at(model.buft_layer[il].buft); + } else if (strstr(cname, "tok") == 0) { + dev_ctx = ctx_map.at(model.buft_input.buft); + } else if (strstr(cname, "output") == 0) { + dev_ctx = ctx_map.at(model.buft_output.buft); } else { - // TODO: process output & token_embd tensors + LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); + continue; } + // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); } // allocate tensors / buffers and zero @@ -18402,8 +18422,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co if (read_buf.size() < size) { read_buf.resize(size); } + gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - printf("%s: %s size=%ld\n", __func__, orig->name, size); + // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size); return ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { @@ -18414,11 +18435,165 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } } + LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2); + // free ctx for reading gguf ggml_free(ctx); return 0; } +static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { + // TODO @ngxson : not ideal, but "const" is discarded to make it work + struct llama_model & model = const_cast(lctx.model); + if (!model.orig_tensors.empty()) { + size_t i = 0; + model.tok_embd = model.orig_tensors[i++]; + model.type_embd = model.orig_tensors[i++]; + model.pos_embd = model.orig_tensors[i++]; + model.tok_norm = model.orig_tensors[i++]; + model.tok_norm_b = model.orig_tensors[i++]; + model.output_norm = model.orig_tensors[i++]; + model.output_norm_b = model.orig_tensors[i++]; + model.output = model.orig_tensors[i++]; + model.output_b = model.orig_tensors[i++]; + model.output_norm_enc = model.orig_tensors[i++]; + for (size_t il = 0; il < model.orig_layers.size(); il++) { + model.layers[il] = model.orig_layers[il]; // copy + } + } +} + +static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) { + GGML_ASSERT(!lctx.lora_adapters.empty()); + // TODO @ngxson : not ideal, but "const" is discarded to make it work + struct llama_model & model = const_cast(lctx.model); + + // save all original tensors + if (model.orig_tensors.empty()) { + model.orig_tensors.push_back(model.tok_embd); + model.orig_tensors.push_back(model.type_embd); + model.orig_tensors.push_back(model.pos_embd); + model.orig_tensors.push_back(model.tok_norm); + model.orig_tensors.push_back(model.tok_norm_b); + model.orig_tensors.push_back(model.output_norm); + model.orig_tensors.push_back(model.output_norm_b); + model.orig_tensors.push_back(model.output); + model.orig_tensors.push_back(model.output_b); + 
model.orig_tensors.push_back(model.output_norm_enc); + model.orig_layers.reserve(model.layers.size()); + for (llama_layer layer : model.layers) { + model.orig_layers.push_back(layer); // copy + } + } + + // patch tensors + auto patch_tensor = [&](struct llama_lora_adapter * adapter, struct ggml_tensor ** tensor) { + if (*tensor == nullptr) { + return; + } + std::string name = ggml_get_name(*tensor); + if (adapter->ab_map.find(name) != adapter->ab_map.end()) { + auto lora_w = adapter->ab_map[name]; + struct ggml_tensor * cur = ggml_mul_mat(ctx_build, lora_w.a, lora_w.b); + cur = ggml_add(ctx_build, cur, *tensor); + // TODO: scale + ggml_format_name(cur, "%s.merged", name.c_str()); + // LLAMA_LOG_INFO("LORA %s\n", cur->name); + tensor = &cur; + } + }; + for (auto adapter : lctx.lora_adapters) { + patch_tensor(adapter, &model.tok_embd); + patch_tensor(adapter, &model.type_embd); + patch_tensor(adapter, &model.pos_embd); + patch_tensor(adapter, &model.tok_norm); + patch_tensor(adapter, &model.tok_norm_b); + patch_tensor(adapter, &model.output_norm); + patch_tensor(adapter, &model.output_norm_b); + patch_tensor(adapter, &model.output); + patch_tensor(adapter, &model.output_b); + patch_tensor(adapter, &model.output_norm_enc); + for (llama_layer & layer : model.layers) { + patch_tensor(adapter, &layer.attn_norm); + patch_tensor(adapter, &layer.attn_norm_b); + patch_tensor(adapter, &layer.attn_norm_2); + patch_tensor(adapter, &layer.attn_norm_2_b); + patch_tensor(adapter, &layer.attn_q_norm); + patch_tensor(adapter, &layer.attn_q_norm_b); + patch_tensor(adapter, &layer.attn_k_norm); + patch_tensor(adapter, &layer.attn_k_norm_b); + patch_tensor(adapter, &layer.attn_out_norm); + patch_tensor(adapter, &layer.attn_out_norm_b); + patch_tensor(adapter, &layer.attn_q_a_norm); + patch_tensor(adapter, &layer.attn_kv_a_norm); + patch_tensor(adapter, &layer.attn_sub_norm); + patch_tensor(adapter, &layer.attn_post_norm); + patch_tensor(adapter, &layer.ffn_sub_norm); + patch_tensor(adapter, &layer.attn_norm_cross); + patch_tensor(adapter, &layer.attn_norm_enc); + + patch_tensor(adapter, &layer.wq); + patch_tensor(adapter, &layer.wk); + patch_tensor(adapter, &layer.wv); + patch_tensor(adapter, &layer.wo); + patch_tensor(adapter, &layer.wqkv); + patch_tensor(adapter, &layer.wq_a); + patch_tensor(adapter, &layer.wq_b); + patch_tensor(adapter, &layer.wkv_a_mqa); + patch_tensor(adapter, &layer.wkv_b); + patch_tensor(adapter, &layer.wq_cross); + patch_tensor(adapter, &layer.wk_cross); + patch_tensor(adapter, &layer.wv_cross); + patch_tensor(adapter, &layer.wo_cross); + patch_tensor(adapter, &layer.wq_enc); + patch_tensor(adapter, &layer.wk_enc); + patch_tensor(adapter, &layer.wv_enc); + patch_tensor(adapter, &layer.wo_enc); + + patch_tensor(adapter, &layer.bq); + patch_tensor(adapter, &layer.bk); + patch_tensor(adapter, &layer.bv); + patch_tensor(adapter, &layer.bo); + patch_tensor(adapter, &layer.bqkv); + + patch_tensor(adapter, &layer.attn_rel_b); + patch_tensor(adapter, &layer.attn_rel_b_enc); + patch_tensor(adapter, &layer.attn_rel_b_cross); + + patch_tensor(adapter, &layer.ffn_norm); + patch_tensor(adapter, &layer.ffn_norm_b); + patch_tensor(adapter, &layer.ffn_post_norm); + patch_tensor(adapter, &layer.layer_out_norm); + patch_tensor(adapter, &layer.layer_out_norm_b); + patch_tensor(adapter, &layer.ffn_norm_exps); + patch_tensor(adapter, &layer.ffn_norm_enc); + + patch_tensor(adapter, &layer.ffn_gate); + patch_tensor(adapter, &layer.ffn_down); + patch_tensor(adapter, &layer.ffn_up); + patch_tensor(adapter, 
&layer.ffn_gate_enc); + patch_tensor(adapter, &layer.ffn_down_enc); + patch_tensor(adapter, &layer.ffn_up_enc); + + patch_tensor(adapter, &layer.ffn_gate_inp); + patch_tensor(adapter, &layer.ffn_gate_exps); + patch_tensor(adapter, &layer.ffn_down_exps); + patch_tensor(adapter, &layer.ffn_up_exps ); + + patch_tensor(adapter, &layer.ffn_gate_inp_shexp); + patch_tensor(adapter, &layer.ffn_gate_shexp); + patch_tensor(adapter, &layer.ffn_down_shexp); + patch_tensor(adapter, &layer.ffn_up_shexp); + + patch_tensor(adapter, &layer.ffn_gate_b); + patch_tensor(adapter, &layer.ffn_down_b); + patch_tensor(adapter, &layer.ffn_up_b); + patch_tensor(adapter, &layer.ffn_act); + } + } + return 0; +} + // // interface implementation // From 4e28ad40a099c7f618abf8ae113c4e56ee7705e8 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 13:29:37 +0200 Subject: [PATCH 03/33] correct tensor patch --- ggml/src/ggml.c | 4 ++-- src/llama.cpp | 33 ++++++++++++--------------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bc91ac3a726ab..2093be2a98013 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; @@ -19401,7 +19401,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); - if (ggml_nelements(node) < 5) { + if (ggml_nelements(node) < 5 && node->data != NULL) { fprintf(fp, " | ("); for (int j = 0; j < ggml_nelements(node); j++) { if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { diff --git a/src/llama.cpp b/src/llama.cpp index d97eb3bb2fc63..1c7f6650a9c47 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18314,7 +18314,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co if (il == 0) n_tensors_per_layer++; } } - printf("n_tensors_per_layer %d\n", n_tensors_per_layer); + // printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types std::map buft_tensor_count; @@ -18363,6 +18363,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } else { ab_map[name].b = cur; } + } else { + LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); } } @@ -18400,14 +18402,14 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co adapter.bufs.reserve(ctx_map.size()); for (auto it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + ggml_context * ctx_dev = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); if (!buf) { LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); return -1; } ggml_backend_buffer_clear(buf, 0); - adapter.ctxs.push_back(ctx); + adapter.ctxs.push_back(ctx_dev); adapter.bufs.push_back(buf); } } @@ -18424,8 +18426,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size); - return 
ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size); + ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { auto orig = ab_map[it.first]; @@ -18461,6 +18463,7 @@ static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { model.layers[il] = model.orig_layers[il]; // copy } } + return 0; } static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) { @@ -18498,8 +18501,8 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml cur = ggml_add(ctx_build, cur, *tensor); // TODO: scale ggml_format_name(cur, "%s.merged", name.c_str()); - // LLAMA_LOG_INFO("LORA %s\n", cur->name); - tensor = &cur; + // LLAMA_LOG_INFO("LORA %p %s\n", cur, cur->name); + *tensor = cur; } }; for (auto adapter : lctx.lora_adapters) { @@ -18541,14 +18544,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.wq_b); patch_tensor(adapter, &layer.wkv_a_mqa); patch_tensor(adapter, &layer.wkv_b); - patch_tensor(adapter, &layer.wq_cross); - patch_tensor(adapter, &layer.wk_cross); - patch_tensor(adapter, &layer.wv_cross); - patch_tensor(adapter, &layer.wo_cross); - patch_tensor(adapter, &layer.wq_enc); - patch_tensor(adapter, &layer.wk_enc); - patch_tensor(adapter, &layer.wv_enc); - patch_tensor(adapter, &layer.wo_enc); patch_tensor(adapter, &layer.bq); patch_tensor(adapter, &layer.bk); @@ -18556,10 +18551,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.bo); patch_tensor(adapter, &layer.bqkv); - patch_tensor(adapter, &layer.attn_rel_b); - patch_tensor(adapter, &layer.attn_rel_b_enc); - patch_tensor(adapter, &layer.attn_rel_b_cross); - patch_tensor(adapter, &layer.ffn_norm); patch_tensor(adapter, &layer.ffn_norm_b); patch_tensor(adapter, &layer.ffn_post_norm); @@ -18578,7 +18569,7 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.ffn_gate_inp); patch_tensor(adapter, &layer.ffn_gate_exps); patch_tensor(adapter, &layer.ffn_down_exps); - patch_tensor(adapter, &layer.ffn_up_exps ); + patch_tensor(adapter, &layer.ffn_up_exps); patch_tensor(adapter, &layer.ffn_gate_inp_shexp); patch_tensor(adapter, &layer.ffn_gate_shexp); From 1b4ffbac4720cd9bee0bc0422df927a1ff1dc22f Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 14:24:56 +0200 Subject: [PATCH 04/33] llama_lora_adapter_apply --- common/common.cpp | 3 +- ggml/src/ggml.c | 2 +- include/llama.h | 6 +- src/llama.cpp | 253 +++++++++++++++++++++------------------------- 4 files changed, 122 insertions(+), 142 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d3eec6aa783b3..d5dd4d38d3cf0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2063,13 +2063,14 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str()); + auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str(), lora_scale); if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } + llama_lora_adapter_apply(lctx, adapter); } if 
(params.ignore_eos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2093be2a98013..2e09b7087e667 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = TB;\n"); + fprintf(fp, " rankdir = LR;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; diff --git a/include/llama.h b/include/llama.h index 077d902837c49..50ea0d84773bf 100644 --- a/include/llama.h +++ b/include/llama.h @@ -515,7 +515,11 @@ extern "C" { // will be applied on top of the previous one LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_context * ctx, - const char * path_lora); + const char * path_lora, + float scale); + LLAMA_API int32_t llama_lora_adapter_apply( + struct llama_context * ctx, + struct llama_lora_adapter * adapter); // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. diff --git a/src/llama.cpp b/src/llama.cpp index 1c7f6650a9c47..de3d77485c0c2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2559,6 +2559,7 @@ struct llama_lora_adapter { std::map ab_map; std::vector ctxs; std::vector bufs; + float scale = 1.0f; ~llama_lora_adapter() { for (struct ggml_context * ctx : ctxs) { @@ -13495,10 +13496,6 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { return result; } -// forward declaration -static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build); -static int32_t llama_lora_restore_tensors(struct llama_context & lctx); - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -13542,11 +13539,6 @@ static struct ggml_cgraph * llama_build_graph( llm.init(); - if (!lctx.lora_adapters.empty()) { - llama_lora_restore_tensors(lctx); - llama_lora_patch_tensors(lctx, llm.ctx0); - } - switch (model.arch) { case LLM_ARCH_LLAMA: { @@ -18444,144 +18436,126 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co return 0; } -static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { - // TODO @ngxson : not ideal, but "const" is discarded to make it work - struct llama_model & model = const_cast(lctx.model); - if (!model.orig_tensors.empty()) { - size_t i = 0; - model.tok_embd = model.orig_tensors[i++]; - model.type_embd = model.orig_tensors[i++]; - model.pos_embd = model.orig_tensors[i++]; - model.tok_norm = model.orig_tensors[i++]; - model.tok_norm_b = model.orig_tensors[i++]; - model.output_norm = model.orig_tensors[i++]; - model.output_norm_b = model.orig_tensors[i++]; - model.output = model.orig_tensors[i++]; - model.output_b = model.orig_tensors[i++]; - model.output_norm_enc = model.orig_tensors[i++]; - for (size_t il = 0; il < model.orig_layers.size(); il++) { - model.layers[il] = model.orig_layers[il]; // copy - } - } - return 0; -} +int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_adapter * adapter) { + GGML_ASSERT(!lctx->lora_adapters.empty()); + const struct llama_model & model = lctx->model; + struct ggml_init_params ctx0_params = { + /*.mem_size =*/ lctx->buf_compute_meta.size(), + /*.mem_buffer =*/ lctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(ctx0_params); -static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) 
{ - GGML_ASSERT(!lctx.lora_adapters.empty()); - // TODO @ngxson : not ideal, but "const" is discarded to make it work - struct llama_model & model = const_cast(lctx.model); - - // save all original tensors - if (model.orig_tensors.empty()) { - model.orig_tensors.push_back(model.tok_embd); - model.orig_tensors.push_back(model.type_embd); - model.orig_tensors.push_back(model.pos_embd); - model.orig_tensors.push_back(model.tok_norm); - model.orig_tensors.push_back(model.tok_norm_b); - model.orig_tensors.push_back(model.output_norm); - model.orig_tensors.push_back(model.output_norm_b); - model.orig_tensors.push_back(model.output); - model.orig_tensors.push_back(model.output_b); - model.orig_tensors.push_back(model.output_norm_enc); - model.orig_layers.reserve(model.layers.size()); - for (llama_layer layer : model.layers) { - model.orig_layers.push_back(layer); // copy - } - } - - // patch tensors - auto patch_tensor = [&](struct llama_lora_adapter * adapter, struct ggml_tensor ** tensor) { - if (*tensor == nullptr) { + // apply lora for model tensors + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + std::vector> output_nodes; + auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) { + if (model_tensor == nullptr) { return; } - std::string name = ggml_get_name(*tensor); + std::string name = ggml_get_name(model_tensor); if (adapter->ab_map.find(name) != adapter->ab_map.end()) { auto lora_w = adapter->ab_map[name]; - struct ggml_tensor * cur = ggml_mul_mat(ctx_build, lora_w.a, lora_w.b); - cur = ggml_add(ctx_build, cur, *tensor); - // TODO: scale + struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); + cur = ggml_scale_inplace(ctx0, cur, adapter->scale); + cur = ggml_add(ctx0, cur, model_tensor); ggml_format_name(cur, "%s.merged", name.c_str()); - // LLAMA_LOG_INFO("LORA %p %s\n", cur, cur->name); - *tensor = cur; + ggml_build_forward_expand(gf, cur); + output_nodes.push_back({model_tensor, cur}); } }; - for (auto adapter : lctx.lora_adapters) { - patch_tensor(adapter, &model.tok_embd); - patch_tensor(adapter, &model.type_embd); - patch_tensor(adapter, &model.pos_embd); - patch_tensor(adapter, &model.tok_norm); - patch_tensor(adapter, &model.tok_norm_b); - patch_tensor(adapter, &model.output_norm); - patch_tensor(adapter, &model.output_norm_b); - patch_tensor(adapter, &model.output); - patch_tensor(adapter, &model.output_b); - patch_tensor(adapter, &model.output_norm_enc); - for (llama_layer & layer : model.layers) { - patch_tensor(adapter, &layer.attn_norm); - patch_tensor(adapter, &layer.attn_norm_b); - patch_tensor(adapter, &layer.attn_norm_2); - patch_tensor(adapter, &layer.attn_norm_2_b); - patch_tensor(adapter, &layer.attn_q_norm); - patch_tensor(adapter, &layer.attn_q_norm_b); - patch_tensor(adapter, &layer.attn_k_norm); - patch_tensor(adapter, &layer.attn_k_norm_b); - patch_tensor(adapter, &layer.attn_out_norm); - patch_tensor(adapter, &layer.attn_out_norm_b); - patch_tensor(adapter, &layer.attn_q_a_norm); - patch_tensor(adapter, &layer.attn_kv_a_norm); - patch_tensor(adapter, &layer.attn_sub_norm); - patch_tensor(adapter, &layer.attn_post_norm); - patch_tensor(adapter, &layer.ffn_sub_norm); - patch_tensor(adapter, &layer.attn_norm_cross); - patch_tensor(adapter, &layer.attn_norm_enc); - - patch_tensor(adapter, &layer.wq); - patch_tensor(adapter, &layer.wk); - patch_tensor(adapter, &layer.wv); - patch_tensor(adapter, &layer.wo); - patch_tensor(adapter, &layer.wqkv); - patch_tensor(adapter, &layer.wq_a); 
- patch_tensor(adapter, &layer.wq_b); - patch_tensor(adapter, &layer.wkv_a_mqa); - patch_tensor(adapter, &layer.wkv_b); - - patch_tensor(adapter, &layer.bq); - patch_tensor(adapter, &layer.bk); - patch_tensor(adapter, &layer.bv); - patch_tensor(adapter, &layer.bo); - patch_tensor(adapter, &layer.bqkv); - - patch_tensor(adapter, &layer.ffn_norm); - patch_tensor(adapter, &layer.ffn_norm_b); - patch_tensor(adapter, &layer.ffn_post_norm); - patch_tensor(adapter, &layer.layer_out_norm); - patch_tensor(adapter, &layer.layer_out_norm_b); - patch_tensor(adapter, &layer.ffn_norm_exps); - patch_tensor(adapter, &layer.ffn_norm_enc); - - patch_tensor(adapter, &layer.ffn_gate); - patch_tensor(adapter, &layer.ffn_down); - patch_tensor(adapter, &layer.ffn_up); - patch_tensor(adapter, &layer.ffn_gate_enc); - patch_tensor(adapter, &layer.ffn_down_enc); - patch_tensor(adapter, &layer.ffn_up_enc); - - patch_tensor(adapter, &layer.ffn_gate_inp); - patch_tensor(adapter, &layer.ffn_gate_exps); - patch_tensor(adapter, &layer.ffn_down_exps); - patch_tensor(adapter, &layer.ffn_up_exps); - - patch_tensor(adapter, &layer.ffn_gate_inp_shexp); - patch_tensor(adapter, &layer.ffn_gate_shexp); - patch_tensor(adapter, &layer.ffn_down_shexp); - patch_tensor(adapter, &layer.ffn_up_shexp); - - patch_tensor(adapter, &layer.ffn_gate_b); - patch_tensor(adapter, &layer.ffn_down_b); - patch_tensor(adapter, &layer.ffn_up_b); - patch_tensor(adapter, &layer.ffn_act); - } + apply_lora(adapter, model.tok_embd); + apply_lora(adapter, model.type_embd); + apply_lora(adapter, model.pos_embd); + apply_lora(adapter, model.tok_norm); + apply_lora(adapter, model.tok_norm_b); + apply_lora(adapter, model.output_norm); + apply_lora(adapter, model.output_norm_b); + apply_lora(adapter, model.output); + apply_lora(adapter, model.output_b); + apply_lora(adapter, model.output_norm_enc); + for (const llama_layer & layer : model.layers) { + apply_lora(adapter, layer.attn_norm); + apply_lora(adapter, layer.attn_norm_b); + apply_lora(adapter, layer.attn_norm_2); + apply_lora(adapter, layer.attn_norm_2_b); + apply_lora(adapter, layer.attn_q_norm); + apply_lora(adapter, layer.attn_q_norm_b); + apply_lora(adapter, layer.attn_k_norm); + apply_lora(adapter, layer.attn_k_norm_b); + apply_lora(adapter, layer.attn_out_norm); + apply_lora(adapter, layer.attn_out_norm_b); + apply_lora(adapter, layer.attn_q_a_norm); + apply_lora(adapter, layer.attn_kv_a_norm); + apply_lora(adapter, layer.attn_sub_norm); + apply_lora(adapter, layer.attn_post_norm); + apply_lora(adapter, layer.ffn_sub_norm); + apply_lora(adapter, layer.attn_norm_cross); + apply_lora(adapter, layer.attn_norm_enc); + + apply_lora(adapter, layer.wq); + apply_lora(adapter, layer.wk); + apply_lora(adapter, layer.wv); + apply_lora(adapter, layer.wo); + apply_lora(adapter, layer.wqkv); + apply_lora(adapter, layer.wq_a); + apply_lora(adapter, layer.wq_b); + apply_lora(adapter, layer.wkv_a_mqa); + apply_lora(adapter, layer.wkv_b); + + apply_lora(adapter, layer.bq); + apply_lora(adapter, layer.bk); + apply_lora(adapter, layer.bv); + apply_lora(adapter, layer.bo); + apply_lora(adapter, layer.bqkv); + + apply_lora(adapter, layer.ffn_norm); + apply_lora(adapter, layer.ffn_norm_b); + apply_lora(adapter, layer.ffn_post_norm); + apply_lora(adapter, layer.layer_out_norm); + apply_lora(adapter, layer.layer_out_norm_b); + apply_lora(adapter, layer.ffn_norm_exps); + apply_lora(adapter, layer.ffn_norm_enc); + + apply_lora(adapter, layer.ffn_gate); + apply_lora(adapter, layer.ffn_down); + apply_lora(adapter, 
layer.ffn_up); + apply_lora(adapter, layer.ffn_gate_enc); + apply_lora(adapter, layer.ffn_down_enc); + apply_lora(adapter, layer.ffn_up_enc); + + apply_lora(adapter, layer.ffn_gate_inp); + apply_lora(adapter, layer.ffn_gate_exps); + apply_lora(adapter, layer.ffn_down_exps); + apply_lora(adapter, layer.ffn_up_exps); + + apply_lora(adapter, layer.ffn_gate_inp_shexp); + apply_lora(adapter, layer.ffn_gate_shexp); + apply_lora(adapter, layer.ffn_down_shexp); + apply_lora(adapter, layer.ffn_up_shexp); + + apply_lora(adapter, layer.ffn_gate_b); + apply_lora(adapter, layer.ffn_down_b); + apply_lora(adapter, layer.ffn_up_b); + apply_lora(adapter, layer.ffn_act); + } + + // merge lora to model weight + ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); + if (res == GGML_STATUS_SUCCESS) { + for (auto & out : output_nodes) { + struct ggml_tensor * model_tensor = out.first; + struct ggml_tensor * merged_tensor = out.second; + ggml_backend_tensor_copy(merged_tensor, model_tensor); + ggml_set_name(model_tensor, merged_tensor->name); + } + LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, output_nodes.size()); + } else { + LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); + return res; } + + ggml_free(ctx0); return 0; } @@ -19362,9 +19336,10 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora, float scale) { try { struct llama_lora_adapter * adapter = new llama_lora_adapter; + adapter->scale = scale; int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); if (res == 0) { ctx->lora_adapters.push_back(adapter); From b88ce0f8927427929e25f45a419623a55ca043f4 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 15:06:32 +0200 Subject: [PATCH 05/33] correct ggml_backend_tensor_copy --- src/llama.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index de3d77485c0c2..5f02106d366a2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18446,9 +18446,10 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ }; struct ggml_context * ctx0 = ggml_init(ctx0_params); + // map "merged.%s" name to model tensor + std::map output_map; // apply lora for model tensors struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - std::vector> output_nodes; auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) { if (model_tensor == nullptr) { return; @@ -18459,9 +18460,9 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); cur = ggml_scale_inplace(ctx0, cur, adapter->scale); cur = ggml_add(ctx0, cur, model_tensor); - ggml_format_name(cur, "%s.merged", name.c_str()); + ggml_format_name(cur, "merged.%s", name.c_str()); ggml_build_forward_expand(gf, cur); - output_nodes.push_back({model_tensor, cur}); + output_map[std::string(cur->name)] = model_tensor; } }; apply_lora(adapter, model.tok_embd); @@ -18543,13 +18544,19 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ // merge lora to model weight ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); if (res == GGML_STATUS_SUCCESS) { - for (auto & out : output_nodes) 
{ - struct ggml_tensor * model_tensor = out.first; - struct ggml_tensor * merged_tensor = out.second; - ggml_backend_tensor_copy(merged_tensor, model_tensor); - ggml_set_name(model_tensor, merged_tensor->name); - } - LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, output_nodes.size()); + // graph maybe realloc, we need to find correct gf->nodes based on name + size_t n_merged = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + auto node = gf->nodes[i]; + std::string name(node->name); + if (output_map.find(name) != output_map.end()) { + struct ggml_tensor * model_tensor = output_map[name]; + ggml_backend_tensor_copy(node, model_tensor); + n_merged++; + } + } + GGML_ASSERT(n_merged == output_map.size()); + LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, n_merged); } else { LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); return res; From f6d090d7de2544be6a508d53630e791d9ce0751f Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 7 Jul 2024 16:01:05 +0200 Subject: [PATCH 06/33] add llm_build_mm --- common/common.cpp | 4 +- ggml/src/ggml.c | 2 +- include/llama.h | 24 ++- src/llama.cpp | 467 ++++++++++++++++++++-------------------------- 4 files changed, 220 insertions(+), 277 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d5dd4d38d3cf0..ec5709f83fd5e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2063,14 +2063,14 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str(), lora_scale); + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - llama_lora_adapter_apply(lctx, adapter); + llama_lora_adapter_set(lctx, adapter, lora_scale); } if (params.ignore_eos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2e09b7087e667..2093be2a98013 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; diff --git a/include/llama.h b/include/llama.h index 50ea0d84773bf..37140b7714788 100644 --- a/include/llama.h +++ b/include/llama.h @@ -508,19 +508,29 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a LoRA adapter to a loaded model - // path_base_model is the path to a higher quality model to use as a base for - // the layers modified by the adapter. Can be NULL to use the current loaded model. 
- // The model needs to be reloaded before applying a new adapter, otherwise the adapter - // will be applied on top of the previous one + // Load a LoRA adapter from file + // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( + struct llama_model * model, + const char * path_lora); + + // Add a loaded LoRA adapter to given context + // This will not modify model's weight + LLAMA_API int32_t llama_lora_adapter_set( struct llama_context * ctx, - const char * path_lora, + struct llama_lora_adapter * adapter, float scale); - LLAMA_API int32_t llama_lora_adapter_apply( + + // Remove a LoRA adapter from given context + // Return -1 if the adapter is not present in the context + LLAMA_API int32_t llama_lora_adapter_remove( struct llama_context * ctx, struct llama_lora_adapter * adapter); + // Manually free a LoRA adapter + // Note: loaded adapters will be free when the associated model is deleted + LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. // n_embd should be the size of a single layer's control, and data should point diff --git a/src/llama.cpp b/src/llama.cpp index 5f02106d366a2..ee18ca847fde5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,30 +2547,6 @@ struct llama_control_vector { } }; -struct lora_weight { - struct ggml_tensor * a = nullptr; - struct ggml_tensor * b = nullptr; - lora_weight() {} - lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} -}; - -struct llama_lora_adapter { - // map tensor name to lora_a_b - std::map ab_map; - std::vector ctxs; - std::vector bufs; - float scale = 1.0f; - - ~llama_lora_adapter() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } -}; - struct llama_vocab { using id = int32_t; using token = std::string; @@ -2703,9 +2679,8 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; - // used by lora, to save model's original tensors - std::vector orig_tensors; - std::vector orig_layers; + // keep track of loaded lora adapters + std::set lora_adapters; ~llama_model() { for (struct ggml_context * ctx : ctxs) { @@ -2719,6 +2694,9 @@ struct llama_model { #endif ggml_backend_buffer_free(buf); } + while (!lora_adapters.empty()) { + llama_lora_adapter_free(*lora_adapters.begin()); + } } }; @@ -2732,10 +2710,6 @@ struct llama_context { } ggml_backend_buffer_free(buf_output); - - for (auto adapter : lora_adapters) { - delete adapter; - } } llama_cparams cparams; @@ -2828,8 +2802,50 @@ struct llama_context { // control vectors struct llama_control_vector cvec; - // lora adapters - std::vector lora_adapters; + // lora adapters and scales + std::map lora_adapters; +}; + +struct lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + lora_weight() {} + lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + struct llama_model * base_model; + // map tensor name to lora_a_b + std::map ab_map; + std::vector ctxs; + std::vector bufs; + + llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { + base_model->lora_adapters.insert(this); + } + + bool has_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.find(name) != ab_map.end(); + } + + 
lora_weight & get_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.at(name); + } + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + auto pos = base_model->lora_adapters.find(this); + if (pos != base_model->lora_adapters.end()) { + base_model->lora_adapters.erase(pos); + } + } }; static size_t llama_get_device_count(const llama_model & model) { @@ -7773,6 +7789,32 @@ static void llm_build_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); } +// do mat_mul, while optionally apply lora +static struct ggml_tensor * llm_build_mm( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_adapter * adapter = it.first; + float scale = it.second; + if (!adapter->has_weight(w)) { + continue; + } + struct lora_weight & lora = adapter->get_weight(w); + // TODO: check if lora_a need transpose + struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a)); + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lora.b, + ggml_mul_mat(ctx0, a, cur) + ); + ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -7806,6 +7848,7 @@ static struct ggml_tensor * llm_build_norm( } static struct ggml_tensor * llm_build_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * up, @@ -7822,7 +7865,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? 
llm_build_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7839,12 +7882,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = ggml_mul_mat(ctx, gate, tmp); + cur = llm_build_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = ggml_mul_mat(ctx, gate, cur); + cur = llm_build_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -7899,7 +7942,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = ggml_mul_mat(ctx, down, cur); + cur = llm_build_mm(lctx, ctx, down, cur); } if (down_b) { @@ -7919,6 +7962,7 @@ static struct ggml_tensor * llm_build_ffn( } static struct ggml_tensor * llm_build_moe_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * gate_inp, @@ -7936,7 +7980,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8019,6 +8063,7 @@ static struct ggml_tensor * llm_build_moe_ffn( } static struct ggml_tensor * llm_build_kqv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8076,7 +8121,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q); cb(kq, "kq", il); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { @@ -8119,7 +8164,7 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq); cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); @@ -8132,7 +8177,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = ggml_mul_mat(ctx, wo, cur); + cur = llm_build_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8147,6 +8192,7 @@ static struct ggml_tensor * llm_build_kqv( } static struct ggml_tensor * llm_build_kv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8176,7 +8222,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, + cur = llm_build_kqv(lctx, ctx, model, hparams, cparams, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -8638,21 +8684,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); 
} - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8673,7 +8719,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8696,7 +8742,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -8710,7 +8756,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -8740,7 +8786,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -8808,7 +8854,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8830,7 +8876,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -8913,7 +8959,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8935,7 +8981,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9034,7 +9080,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9051,7 +9097,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + cur = llm_build_ffn(lctx, ctx0, attn_norm, // !! 
use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9158,7 +9204,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -9190,7 +9236,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9308,7 +9354,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9331,7 +9377,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9418,7 +9464,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9442,7 +9488,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9512,7 +9558,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9534,7 +9580,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9719,21 +9765,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, 
@@ -9807,7 +9853,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9831,7 +9877,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9939,13 +9985,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9969,7 +10015,7 @@ struct llm_build_context { model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10090,7 +10136,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10118,7 +10164,7 @@ struct llm_build_context { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10209,7 +10255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10231,7 +10277,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10323,7 +10369,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10344,7 +10390,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = 
llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10438,7 +10484,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10461,7 +10507,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -10481,7 +10527,7 @@ struct llm_build_context { ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, + ggml_tensor * cur_ffn = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -10595,7 +10641,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10610,7 +10656,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, attn_norm_output, + ffn_output = llm_build_ffn(lctx, ctx0, attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10715,7 +10761,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10830,7 +10876,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10848,7 +10894,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10932,7 +10978,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10956,7 +11002,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11043,7 +11089,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = 
llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11067,7 +11113,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11163,7 +11209,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11184,7 +11230,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11281,7 +11327,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11302,7 +11348,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11412,7 +11458,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11439,7 +11485,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11534,7 +11580,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11556,7 +11602,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11647,7 +11693,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11674,7 +11720,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = 
llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11784,7 +11830,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11806,7 +11852,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12077,7 +12123,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12094,7 +12140,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, ffn_inp, + cur = llm_build_ffn(lctx, ctx0, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12209,7 +12255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12231,7 +12277,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12336,7 +12382,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12358,7 +12404,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12445,7 +12491,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12470,7 +12516,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12501,7 +12547,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12588,7 +12634,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12610,7 +12656,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12627,7 +12673,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12810,7 +12856,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -12832,7 +12878,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12847,7 +12893,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12860,7 +12906,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -12965,7 +13011,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -12998,7 +13044,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -13132,7 +13178,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_enc, NULL, NULL, model.layers[il].ffn_gate_enc, NULL, NULL, model.layers[il].ffn_down_enc, NULL, NULL, @@ -13310,7 +13356,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, 
model.layers[il].ffn_down, NULL, NULL, @@ -13392,7 +13438,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -13416,7 +13462,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -18278,7 +18324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { +static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -18310,11 +18356,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co // count layer buffer types std::map buft_tensor_count; - for (int64_t i = 0; i < model.hparams.n_layer; i++) { - buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer; + for (int64_t i = 0; i < model->hparams.n_layer; i++) { + buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; } - buft_tensor_count[model.buft_input.buft] += n_inp_tensors; - buft_tensor_count[model.buft_output.buft] += n_out_tensors; + buft_tensor_count[model->buft_input.buft] += n_inp_tensors; + buft_tensor_count[model->buft_output.buft] += n_out_tensors; // allocate contexts std::map ctx_map; @@ -18371,11 +18417,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co sscanf(cname, "blk.%d.", &il); struct ggml_context * dev_ctx; // device ctx if (il >= 0) { - dev_ctx = ctx_map.at(model.buft_layer[il].buft); + dev_ctx = ctx_map.at(model->buft_layer[il].buft); } else if (strstr(cname, "tok") == 0) { - dev_ctx = ctx_map.at(model.buft_input.buft); + dev_ctx = ctx_map.at(model->buft_input.buft); } else if (strstr(cname, "output") == 0) { - dev_ctx = ctx_map.at(model.buft_output.buft); + dev_ctx = ctx_map.at(model->buft_output.buft); } else { LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); continue; @@ -18436,134 +18482,27 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co return 0; } -int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_adapter * adapter) { - GGML_ASSERT(!lctx->lora_adapters.empty()); - const struct llama_model & model = lctx->model; - struct ggml_init_params ctx0_params = { - /*.mem_size =*/ lctx->buf_compute_meta.size(), - /*.mem_buffer =*/ lctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - struct ggml_context * ctx0 = ggml_init(ctx0_params); - - // map "merged.%s" name to model tensor - std::map output_map; - // apply lora for model tensors - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) 
{ - if (model_tensor == nullptr) { - return; - } - std::string name = ggml_get_name(model_tensor); - if (adapter->ab_map.find(name) != adapter->ab_map.end()) { - auto lora_w = adapter->ab_map[name]; - struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); - cur = ggml_scale_inplace(ctx0, cur, adapter->scale); - cur = ggml_add(ctx0, cur, model_tensor); - ggml_format_name(cur, "merged.%s", name.c_str()); - ggml_build_forward_expand(gf, cur); - output_map[std::string(cur->name)] = model_tensor; - } - }; - apply_lora(adapter, model.tok_embd); - apply_lora(adapter, model.type_embd); - apply_lora(adapter, model.pos_embd); - apply_lora(adapter, model.tok_norm); - apply_lora(adapter, model.tok_norm_b); - apply_lora(adapter, model.output_norm); - apply_lora(adapter, model.output_norm_b); - apply_lora(adapter, model.output); - apply_lora(adapter, model.output_b); - apply_lora(adapter, model.output_norm_enc); - for (const llama_layer & layer : model.layers) { - apply_lora(adapter, layer.attn_norm); - apply_lora(adapter, layer.attn_norm_b); - apply_lora(adapter, layer.attn_norm_2); - apply_lora(adapter, layer.attn_norm_2_b); - apply_lora(adapter, layer.attn_q_norm); - apply_lora(adapter, layer.attn_q_norm_b); - apply_lora(adapter, layer.attn_k_norm); - apply_lora(adapter, layer.attn_k_norm_b); - apply_lora(adapter, layer.attn_out_norm); - apply_lora(adapter, layer.attn_out_norm_b); - apply_lora(adapter, layer.attn_q_a_norm); - apply_lora(adapter, layer.attn_kv_a_norm); - apply_lora(adapter, layer.attn_sub_norm); - apply_lora(adapter, layer.attn_post_norm); - apply_lora(adapter, layer.ffn_sub_norm); - apply_lora(adapter, layer.attn_norm_cross); - apply_lora(adapter, layer.attn_norm_enc); - - apply_lora(adapter, layer.wq); - apply_lora(adapter, layer.wk); - apply_lora(adapter, layer.wv); - apply_lora(adapter, layer.wo); - apply_lora(adapter, layer.wqkv); - apply_lora(adapter, layer.wq_a); - apply_lora(adapter, layer.wq_b); - apply_lora(adapter, layer.wkv_a_mqa); - apply_lora(adapter, layer.wkv_b); - - apply_lora(adapter, layer.bq); - apply_lora(adapter, layer.bk); - apply_lora(adapter, layer.bv); - apply_lora(adapter, layer.bo); - apply_lora(adapter, layer.bqkv); - - apply_lora(adapter, layer.ffn_norm); - apply_lora(adapter, layer.ffn_norm_b); - apply_lora(adapter, layer.ffn_post_norm); - apply_lora(adapter, layer.layer_out_norm); - apply_lora(adapter, layer.layer_out_norm_b); - apply_lora(adapter, layer.ffn_norm_exps); - apply_lora(adapter, layer.ffn_norm_enc); - - apply_lora(adapter, layer.ffn_gate); - apply_lora(adapter, layer.ffn_down); - apply_lora(adapter, layer.ffn_up); - apply_lora(adapter, layer.ffn_gate_enc); - apply_lora(adapter, layer.ffn_down_enc); - apply_lora(adapter, layer.ffn_up_enc); - - apply_lora(adapter, layer.ffn_gate_inp); - apply_lora(adapter, layer.ffn_gate_exps); - apply_lora(adapter, layer.ffn_down_exps); - apply_lora(adapter, layer.ffn_up_exps); - - apply_lora(adapter, layer.ffn_gate_inp_shexp); - apply_lora(adapter, layer.ffn_gate_shexp); - apply_lora(adapter, layer.ffn_down_shexp); - apply_lora(adapter, layer.ffn_up_shexp); - - apply_lora(adapter, layer.ffn_gate_b); - apply_lora(adapter, layer.ffn_down_b); - apply_lora(adapter, layer.ffn_up_b); - apply_lora(adapter, layer.ffn_act); - } - - // merge lora to model weight - ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); - if (res == GGML_STATUS_SUCCESS) { - // graph maybe realloc, we need to find correct gf->nodes based on name - size_t n_merged = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - 
auto node = gf->nodes[i]; - std::string name(node->name); - if (output_map.find(name) != output_map.end()) { - struct ggml_tensor * model_tensor = output_map[name]; - ggml_backend_tensor_copy(node, model_tensor); - n_merged++; - } - } - GGML_ASSERT(n_merged == output_map.size()); - LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, n_merged); - } else { - LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); - return res; +int32_t llama_lora_adapter_set( + struct llama_context * ctx, + struct llama_lora_adapter * adapter, + float scale) { + ctx->lora_adapters[adapter] = scale; + return 0; +} + +int32_t llama_lora_adapter_remove( + struct llama_context * ctx, + struct llama_lora_adapter * adapter) { + auto pos = ctx->lora_adapters.find(adapter); + if (pos != ctx->lora_adapters.end()) { + ctx->lora_adapters.erase(pos); + return 0; } + return -1; +} - ggml_free(ctx0); - return 0; +void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { + delete adapter; } // @@ -19343,17 +19282,11 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora, float scale) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { - struct llama_lora_adapter * adapter = new llama_lora_adapter; - adapter->scale = scale; - int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); - if (res == 0) { - ctx->lora_adapters.push_back(adapter); - return adapter; - } else { - return nullptr; - } + struct llama_lora_adapter * adapter = new llama_lora_adapter(model); + int res = llama_lora_adapter_init_internal(model, path_lora, *adapter); + return res == 0 ? 
adapter : nullptr; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return nullptr; From 30faf1f3def8ce627225f2401fb403d95907a47d Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 7 Jul 2024 16:36:50 +0200 Subject: [PATCH 07/33] fix auto merge --- src/llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7c79e4900dfca..ffc8ffbd23740 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -10863,7 +10863,7 @@ struct llm_build_context { // special-case: the up and gate tensors are merged into a single tensor // TOOD: support into llm_build_ffn { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -13622,7 +13622,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -13647,7 +13647,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, From 79e2982788b0102aabb098b1a3d6227a7e32a483 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 11:59:01 +0200 Subject: [PATCH 08/33] update based on review comments --- src/llama.cpp | 106 +++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ffc8ffbd23740..a4ceb0959caa2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2821,20 +2821,20 @@ struct llama_context { struct llama_control_vector cvec; // lora adapters and scales - std::map lora_adapters; + std::unordered_map lora_adapters; }; -struct lora_weight { +struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; - lora_weight() {} - lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} + llama_lora_weight() {} + llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} }; struct llama_lora_adapter { struct llama_model * base_model; // map tensor name to lora_a_b - std::map ab_map; + std::unordered_map ab_map; std::vector ctxs; std::vector bufs; @@ -2842,14 +2842,13 @@ struct llama_lora_adapter { base_model->lora_adapters.insert(this); } - bool has_weight(struct ggml_tensor * w) { + llama_lora_weight * get_weight(struct ggml_tensor * w) { std::string name(w->name); - return ab_map.find(name) != ab_map.end(); - } - - lora_weight & get_weight(struct ggml_tensor * w) { - std::string name(w->name); - return ab_map.at(name); + auto pos = ab_map.find(name); + if (ab_map.find(name) != ab_map.end()) { + return &pos->second; + } + return nullptr; } ~llama_lora_adapter() { @@ -7855,23 +7854,22 @@ static void llm_build_kv_store( } // do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_mm( +static struct ggml_tensor * llm_build_lora_mm( struct llama_context & lctx, struct ggml_context * ctx0, struct ggml_tensor * w, struct ggml_tensor * cur) { struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); for (auto & it : lctx.lora_adapters) { - struct llama_lora_adapter * adapter = it.first; + struct llama_lora_weight * lora 
= it.first->get_weight(w); float scale = it.second; - if (!adapter->has_weight(w)) { + if (lora == nullptr) { continue; } - struct lora_weight & lora = adapter->get_weight(w); // TODO: check if lora_a need transpose - struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a)); + struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a)); struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lora.b, + ctx0, lora->b, ggml_mul_mat(ctx0, a, cur) ); ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); @@ -7930,7 +7928,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? llm_build_mm(lctx, ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7947,12 +7945,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = llm_build_mm(lctx, ctx, gate, tmp); + cur = llm_build_lora_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = llm_build_mm(lctx, ctx, gate, cur); + cur = llm_build_lora_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -8020,7 +8018,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = llm_build_mm(lctx, ctx, down, cur); + cur = llm_build_lora_mm(lctx, ctx, down, cur); } if (down_b) { @@ -8058,7 +8056,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8199,7 +8197,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { @@ -8242,7 +8240,7 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); @@ -8255,7 +8253,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = llm_build_mm(lctx, ctx, wo, cur); + cur = llm_build_lora_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8762,21 +8760,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8864,7 +8862,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = llm_build_mm(lctx, ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -18517,7 +18515,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { +static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -18532,7 +18530,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); if (!ctx_gguf) { LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); - return -1; + throw std::exception(); } // calculate n_tensors_per_layer @@ -18574,7 +18572,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co } // bundle lora_a and lora_b into pairs - std::map ab_map; + std::map ab_map; auto str_endswith = [](const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; }; @@ -18583,18 +18581,19 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co if (str_endswith(name, ".lora_a")) { replace_all(name, ".lora_a", ""); if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = lora_weight(cur, nullptr); + ab_map[name] = llama_lora_weight(cur, nullptr); } else { ab_map[name].a = cur; } } else if (str_endswith(name, ".lora_b")) { replace_all(name, ".lora_b", ""); if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = lora_weight(nullptr, cur); + ab_map[name] = llama_lora_weight(nullptr, cur); } else { ab_map[name].b = cur; } } else { + // maybe "optimizer.*"" tensors LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); } } @@ -18603,28 +18602,26 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co for (auto & it : ab_map) { std::string name = it.first; const char * cname = name.c_str(); - lora_weight & w = it.second; + llama_lora_weight & w = it.second; GGML_ASSERT(w.a != nullptr); GGML_ASSERT(w.b != nullptr); int il = -1; sscanf(cname, "blk.%d.", &il); - struct ggml_context * dev_ctx; // device ctx - if (il >= 0) { - dev_ctx = ctx_map.at(model->buft_layer[il].buft); - } else if (strstr(cname, "tok") == 0) { - dev_ctx = ctx_map.at(model->buft_input.buft); - } else if (strstr(cname, "output") == 0) { - dev_ctx = ctx_map.at(model->buft_output.buft); - } else { - LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); - continue; + // device buft and device ctx + auto model_tensor = llama_get_model_tensor(model, cname); + if (!model_tensor) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model\n"); } + struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); + // TODO: validate tensor 
shape // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); ggml_set_name(tensor_a, w.a->name); ggml_set_name(tensor_b, w.b->name); - adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); + adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b); } // allocate tensors / buffers and zero @@ -18636,8 +18633,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co ggml_context * ctx_dev = it.second; ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); - return -1; + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("failed to allocate buffer for lora adapter\n"); } ggml_backend_buffer_clear(buf, 0); adapter.ctxs.push_back(ctx_dev); @@ -18671,14 +18669,18 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2); // free ctx for reading gguf + gguf_free(ctx_gguf); ggml_free(ctx); - return 0; } int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, float scale) { + if (ctx->cparams.flash_attn) { + LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__); + return -1; + } ctx->lora_adapters[adapter] = scale; return 0; } @@ -19479,8 +19481,8 @@ uint32_t llama_model_quantize( struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { struct llama_lora_adapter * adapter = new llama_lora_adapter(model); - int res = llama_lora_adapter_init_internal(model, path_lora, *adapter); - return res == 0 ? 
adapter : nullptr; + llama_lora_adapter_init_internal(model, path_lora, *adapter); + return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return nullptr; From 847135aaa25ae999060ddb8431f5d529f9244389 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 16:35:27 +0200 Subject: [PATCH 09/33] add convert script --- convert_lora_to_gguf.py | 149 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100755 convert_lora_to_gguf.py diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py new file mode 100755 index 0000000000000..9a5c7a2c8f916 --- /dev/null +++ b/convert_lora_to_gguf.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import logging +import argparse +import contextlib +import json +import os +import re +import sys +import types +from enum import IntEnum +from pathlib import Path +from hashlib import sha256 +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast + +import math +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +# reuse model definitions from convert_hf_to_gguf.py +from convert_hf_to_gguf import Model + +logger = logging.getLogger("lora-to-gguf") + +def parse_args() -> argparse.Namespace: + all_models = ", ".join([arch for arch in Model._model_classes.keys()]) + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file") + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input.", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--arch", type=str, + help=f"Arch of the base model, must be one of: {all_models} (default: LlamaForCausalLM)", + default="LlamaForCausalLM" + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--base", type=Path, required=True, + help="directory containing base model file", + ) + parser.add_argument( + "lora_path", type=Path, + help="directory containing LoRA adapter file", + ) + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + # FIXME: outtype is not working + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, + } + + dir_base_model = args.base + dir_lora = args.lora_path + input_json = os.path.join(dir_lora, "adapter_config.json") + input_model = os.path.join(dir_lora, "adapter_model.bin") + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_lora / 'ggml-lora.gguf' + + if os.path.exists(input_model): + 
lora_model = torch.load(input_model, map_location="cpu") + else: + input_model = os.path.join(dir_lora, "adapter_model.safetensors") + # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file + lora_model = load_file(input_model, device="cpu") + + # load base model + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) + with torch.inference_mode(): + try: + model_class = Model.from_model_architecture(hparams["architectures"][0]) + except NotImplementedError: + logger.error(f"Model {hparams['architectures'][0]} is not supported") + sys.exit(1) + + model_instance = model_class(dir_base_model, ftype_map[args.outtype], fname_out, args.bigendian, False, False, None) + logger.info("Set model parameters") + model_instance.set_gguf_parameters() + + # adapter_config = json.load(input_json) + model_instance.gguf_writer.add_string("training.type", "finetune_lora") + + map_tensors: dict[str, Tensor] = {} + for tensor_name, tensor in lora_model.items(): + orig_name = tensor_name.replace("base_model.model.", "") + orig_name = orig_name.replace(".lora_A.weight", ".weight") + orig_name = orig_name.replace(".lora_B.weight", ".weight") + is_lora_a = ".lora_A.weight" in tensor_name + is_lora_b = ".lora_B.weight" in tensor_name + if not is_lora_a and not is_lora_b: + logger.error(f"Unexpected name '{tensor_name}': Not a lora_A or lora_B tensor") + sys.exit(1) + dest_name = model_instance.map_tensor_name(orig_name) + dest_name = f"{dest_name}.lora_a" if is_lora_a else f"{dest_name}.lora_b" + # logger.info(f"{orig_name} --> {dest_name}") + map_tensors[dest_name] = tensor + + # overwrite method + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, tensor in map_tensors.items(): + yield (name, tensor) + + # overwrite method + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + return [(name, data_torch)] + + model_instance.get_tensors = types.MethodType(get_tensors, model_instance) + model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + logger.info("Exporting model...") + model_instance.write() + logger.info(f"Model successfully exported to {fname_out}") From 712fecba61b803fc324004220d7bb782240dcba6 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 16:48:55 +0200 Subject: [PATCH 10/33] no more transpose A --- src/llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a4ceb0959caa2..b42cc5fb4837d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7867,10 +7867,9 @@ static struct ggml_tensor * llm_build_lora_mm( continue; } // TODO: check if lora_a need transpose - struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a)); struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, - ggml_mul_mat(ctx0, a, cur) + ggml_mul_mat(ctx0, lora->a, cur) ); ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); res = ggml_add(ctx0, res, ab_cur); From 84288ff9f7e945bb730bb0df069ecf2054ba6076 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 17:05:17 +0200 Subject: [PATCH 11/33] add f16 convert --- convert_lora_to_gguf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 9a5c7a2c8f916..36ccb73cfc333 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -139,10 +139,17 @@ def 
get_tensors(self) -> Iterator[tuple[str, Tensor]]: # overwrite method def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused return [(name, data_torch)] + # overwrite method + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + return True + model_instance.get_tensors = types.MethodType(get_tensors, model_instance) model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) + model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() From 0e1618898599abe2890469a30305697f7c791a52 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 17:44:14 +0200 Subject: [PATCH 12/33] add metadata check --- src/llama.cpp | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index b42cc5fb4837d..ad11ef4943064 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -371,6 +371,8 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + + LLM_KV_TRAINING_TYPE, }; static const std::map LLM_KV_NAMES = { @@ -464,6 +466,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + + { LLM_KV_TRAINING_TYPE, "training.type" }, }; struct LLM_KV { @@ -18519,8 +18523,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - // TODO: check lora base model arch - ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { /* .no_alloc = */ false, @@ -18532,6 +18534,25 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c throw std::exception(); } + // check metadata + { + auto get_kv_str = [&](std::string key) -> std::string { + std::vector str_buf(32, 0); // we only get the arch, so no need big buffer here + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 
"" : std::string(gguf_get_val_str(ctx_gguf, id)); + }; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); + auto lora_arch = llm_arch_from_string(lora_arch_name); + if (lora_arch != model->arch) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); + if (train_type != "finetune_lora") { + throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); + } + } + // calculate n_tensors_per_layer int n_tensors_per_layer = 0; { @@ -18542,7 +18563,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c if (il == 0) n_tensors_per_layer++; } } - // printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types std::map buft_tensor_count; From 6c617e20efc2a8020b99ebdbe4721f17c2c34485 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 21:36:35 +0200 Subject: [PATCH 13/33] add sanity check --- src/llama.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ad11ef4943064..278c7912d4752 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -467,7 +467,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TRAINING_TYPE, "training.type" }, + { LLM_KV_TRAINING_TYPE, "training.type" }, }; struct LLM_KV { @@ -18521,7 +18521,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -18530,8 +18530,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c }; struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); if (!ctx_gguf) { - LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); - throw std::exception(); + throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); } // check metadata @@ -18631,11 +18630,17 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c if (!model_tensor) { gguf_free(ctx_gguf); ggml_free(ctx); - throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model\n"); + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); } struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); - // TODO: validate tensor shape - // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); + // validate tensor shape + if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + throw std::runtime_error("tensor '" + name + "' has incorrect shape"); + } + if (w.a->ne[1] != w.b->ne[0]) { + throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); + } + // save tensor to adapter struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); 
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); ggml_set_name(tensor_a, w.a->name); From 7a83f200d353db68fef8458017c7db17b0a303c4 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 21:55:41 +0200 Subject: [PATCH 14/33] fix ftype --- convert_lora_to_gguf.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 36ccb73cfc333..861ab1e97f536 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -5,19 +5,12 @@ import logging import argparse -import contextlib -import json import os -import re import sys import types -from enum import IntEnum from pathlib import Path -from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from typing import TYPE_CHECKING, Iterable, Iterator -import math -import numpy as np import torch if TYPE_CHECKING: @@ -32,22 +25,17 @@ logger = logging.getLogger("lora-to-gguf") + def parse_args() -> argparse.Namespace: - all_models = ", ".join([arch for arch in Model._model_classes.keys()]) parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") + description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", - ) - parser.add_argument( - "--arch", type=str, - help=f"Arch of the base model, must be one of: {all_models} (default: LlamaForCausalLM)", - default="LlamaForCausalLM" + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0", ) parser.add_argument( "--bigendian", action="store_true", @@ -73,14 +61,13 @@ def parse_args() -> argparse.Namespace: args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - # FIXME: outtype is not working ftype_map: dict[str, gguf.LlamaFileType] = { "f32": gguf.LlamaFileType.ALL_F32, "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "auto": gguf.LlamaFileType.GUESSED, } + ftype = ftype_map[args.outtype] dir_base_model = args.base dir_lora = args.lora_path @@ -110,7 +97,7 @@ def parse_args() -> argparse.Namespace: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_base_model, ftype_map[args.outtype], fname_out, args.bigendian, False, False, None) + model_instance = model_class(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) logger.info("Set model parameters") model_instance.set_gguf_parameters() @@ -140,16 +127,18 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: # overwrite method def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + # TODO: This will not take into account tensor transformations return [(name, data_torch)] # overwrite method def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: del name, new_name, bid, n_dims # unused 
- return True + return ftype != gguf.LlamaFileType.ALL_F32 model_instance.get_tensors = types.MethodType(get_tensors, model_instance) model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() From d52455f2bec45d7e6df8da5b26b91d969ce4580d Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 22:00:13 +0200 Subject: [PATCH 15/33] add requirements --- requirements/requirements-convert_lora_to_gguf.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements/requirements-convert_lora_to_gguf.txt diff --git a/requirements/requirements-convert_lora_to_gguf.txt b/requirements/requirements-convert_lora_to_gguf.txt new file mode 100644 index 0000000000000..5758076c41dc1 --- /dev/null +++ b/requirements/requirements-convert_lora_to_gguf.txt @@ -0,0 +1,2 @@ +-r ./requirements-convert_hf_to_gguf.txt +--extra-index-url https://download.pytorch.org/whl/cpu From 802565ca4327c3dbc02b83ad25ecd4b2bd8253b7 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 22:01:23 +0200 Subject: [PATCH 16/33] fix requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 52456c2e6fd24..9e190ae27de38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ -r ./requirements/requirements-convert_hf_to_gguf.txt -r ./requirements/requirements-convert_hf_to_gguf_update.txt -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt +-r ./requirements/requirements-convert_lora_to_gguf.txt From 95b3eb057b0261a48aeadcb1524a1f58d7ef39cc Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 22:05:35 +0200 Subject: [PATCH 17/33] fix outfile --- convert_lora_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 861ab1e97f536..76c673101a46f 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -31,7 +31,7 @@ def parse_args() -> argparse.Namespace: description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") parser.add_argument( "--outfile", type=Path, - help="path to write to; default: based on input.", + help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", ) parser.add_argument( "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", @@ -77,7 +77,7 @@ def parse_args() -> argparse.Namespace: fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_lora / 'ggml-lora.gguf' + fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' if os.path.exists(input_model): lora_model = torch.load(input_model, map_location="cpu") From ee2b35c65f7e4e862990d6460c4b0a0ac433a874 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 10 Jul 2024 00:23:07 +0200 Subject: [PATCH 18/33] conversion: only allow selected models --- convert_hf_to_gguf.py | 11 ++++++++-- convert_lora_to_gguf.py | 45 +++++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6ee41d3a118e5..109135b6821aa 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -373,6 +373,9 @@ def from_model_architecture(cls, arch: str) -> type[Model]: except KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + def support_lora(self) -> bool: + return False + # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] @@ -1416,9 +1419,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias")): + if name.endswith(("q_proj.weight", "q_proj.bias", "q_proj.lora_B.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): + if name.endswith(("k_proj.weight", "k_proj.bias", "k_proj.lora_B.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately @@ -1466,6 +1469,10 @@ def write_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") + def support_lora(self) -> bool: + # TODO: support lora conversion for MOE + return "num_local_experts" not in self.hparams + @Model.register("BitnetForCausalLM") class BitnetModel(Model): diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 76c673101a46f..c1ae1e9658788 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -9,7 +9,7 @@ import sys import types from pathlib import Path -from typing import TYPE_CHECKING, Iterable, Iterator +from typing import TYPE_CHECKING, Iterator import torch @@ -26,6 +26,13 @@ logger = logging.getLogger("lora-to-gguf") +def get_base_tensor_name(lora_tensor_name: str) -> str: + base_name = lora_tensor_name.replace("base_model.model.", "") + base_name = base_name.replace(".lora_A.weight", ".weight") + base_name = base_name.replace(".lora_B.weight", ".weight") + return base_name + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") @@ -103,43 +110,47 @@ def parse_args() -> argparse.Namespace: # adapter_config = json.load(input_json) model_instance.gguf_writer.add_string("training.type", "finetune_lora") + if not model_instance.support_lora(): + logger.error("LoRA conversion is not yet supported for this model") + sys.exit(1) - map_tensors: dict[str, Tensor] = {} + # map original name to gguf name + map_name: dict[str, str] = {} for tensor_name, tensor in lora_model.items(): - orig_name = 
tensor_name.replace("base_model.model.", "") - orig_name = orig_name.replace(".lora_A.weight", ".weight") - orig_name = orig_name.replace(".lora_B.weight", ".weight") + base_name = get_base_tensor_name(tensor_name) is_lora_a = ".lora_A.weight" in tensor_name is_lora_b = ".lora_B.weight" in tensor_name if not is_lora_a and not is_lora_b: logger.error(f"Unexpected name '{tensor_name}': Not a lora_A or lora_B tensor") sys.exit(1) - dest_name = model_instance.map_tensor_name(orig_name) + dest_name = model_instance.map_tensor_name(base_name) dest_name = f"{dest_name}.lora_a" if is_lora_a else f"{dest_name}.lora_b" - # logger.info(f"{orig_name} --> {dest_name}") - map_tensors[dest_name] = tensor + map_name[tensor_name] = dest_name # overwrite method - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, tensor in map_tensors.items(): - yield (name, tensor) + def map_tensor_name(self, name: str) -> Iterator[tuple[str, Tensor]]: + return map_name[name] # overwrite method - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # TODO: This will not take into account tensor transformations - return [(name, data_torch)] + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, tensor in lora_model.items(): + yield (name, tensor) # overwrite method def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: del name, new_name, bid, n_dims # unused return ftype != gguf.LlamaFileType.ALL_F32 + model_instance._map_tensor_name = model_instance.map_tensor_name + model_instance.map_tensor_name = types.MethodType(map_tensor_name, model_instance) + + model_instance._get_tensors = model_instance.get_tensors model_instance.get_tensors = types.MethodType(get_tensors, model_instance) - model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) + + model_instance._extra_f16_tensors = model_instance.extra_f16_tensors model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() - logger.info(f"Model successfully exported to {fname_out}") + logger.info(f"Model successfully exported to {model_instance.fname_out}") From 713665db2ef05770dc3eb72b277034b2325758b0 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 10 Jul 2024 00:36:52 +0200 Subject: [PATCH 19/33] fix types --- convert_lora_to_gguf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index c1ae1e9658788..c7393ac3aceb2 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -128,7 +128,7 @@ def parse_args() -> argparse.Namespace: map_name[tensor_name] = dest_name # overwrite method - def map_tensor_name(self, name: str) -> Iterator[tuple[str, Tensor]]: + def map_tensor_name(self, name: str) -> str: return map_name[name] # overwrite method @@ -141,13 +141,13 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i del name, new_name, bid, n_dims # unused return ftype != gguf.LlamaFileType.ALL_F32 - model_instance._map_tensor_name = model_instance.map_tensor_name + model_instance._map_tensor_name = model_instance.map_tensor_name # type: ignore model_instance.map_tensor_name = types.MethodType(map_tensor_name, model_instance) - model_instance._get_tensors = model_instance.get_tensors + model_instance._get_tensors = model_instance.get_tensors # 
type: ignore model_instance.get_tensors = types.MethodType(get_tensors, model_instance) - model_instance._extra_f16_tensors = model_instance.extra_f16_tensors + model_instance._extra_f16_tensors = model_instance.extra_f16_tensors # type: ignore model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) From f15167a4c7532101aa61e3e093a92801bf0d3ead Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 10 Jul 2024 02:21:38 +0200 Subject: [PATCH 20/33] cuda : do not use dmmv if the tensor does not have enough cols --- ggml/src/ggml-cuda.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 1c9ccc8a15e54..dfd75e0e7090b 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; + && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2 + && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; From 9841fbda7ceed5226283d7ad254b0d8f72305145 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 10 Jul 2024 02:21:53 +0200 Subject: [PATCH 21/33] llama : lora fixes --- src/llama.cpp | 83 ++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 278c7912d4752..fda48e822dc48 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2831,7 +2831,7 @@ struct llama_context { struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; - llama_lora_weight() {} + llama_lora_weight() = default; llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} }; @@ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - static const int n_inp_tensors = 5; // see llama_model - static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ false, + /* .no_alloc = */ true, /* .ctx = */ &ctx, }; struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); @@ -18536,7 +18534,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c // check metadata { auto get_kv_str = [&](std::string key) -> std::string { - std::vector str_buf(32, 0); // we only get the arch, so no need big buffer here int id = gguf_find_key(ctx_gguf, key.c_str()); return id < 0 ? 
"" : std::string(gguf_get_val_str(ctx_gguf, id)); }; @@ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto lora_arch = llm_arch_from_string(lora_arch_name); if (lora_arch != model->arch) { + gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } + auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); if (train_type != "finetune_lora") { + gguf_free(ctx_gguf); throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); } } - // calculate n_tensors_per_layer - int n_tensors_per_layer = 0; - { - int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - int il = -1; - sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); - if (il == 0) n_tensors_per_layer++; - } - } - - // count layer buffer types - std::map buft_tensor_count; - for (int64_t i = 0; i < model->hparams.n_layer; i++) { - buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; - } - buft_tensor_count[model->buft_input.buft] += n_inp_tensors; - buft_tensor_count[model->buft_output.buft] += n_out_tensors; + int n_tensors = gguf_get_n_tensors(ctx_gguf); - // allocate contexts + // contexts for each buffer type std::map ctx_map; - { - auto new_ggml_ctx = [](size_t n_tensors) { + auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + // add a new context struct ggml_init_params params = { /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - return ggml_init(params); + ggml_context * buft_ctx = ggml_init(params); + ctx_map[buft] = buft_ctx; + return buft_ctx; }; - for (auto & it : buft_tensor_count) { - int n_tensors = it.second; - // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second); - ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors - } - } + return it->second; + }; // bundle lora_a and lora_b into pairs std::map ab_map; @@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ab_map[name].b = cur; } } else { - // maybe "optimizer.*"" tensors - LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } } // add tensors for (auto & it : ab_map) { - std::string name = it.first; - const char * cname = name.c_str(); + const std::string & name = it.first; llama_lora_weight & w = it.second; - GGML_ASSERT(w.a != nullptr); - GGML_ASSERT(w.b != nullptr); - int il = -1; - sscanf(cname, "blk.%d.", &il); + + if (!w.a || !w.b) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); + } + // device buft and device ctx - auto model_tensor = llama_get_model_tensor(model, cname); + auto * model_tensor = llama_get_model_tensor(model, name.c_str()); if (!model_tensor) { gguf_free(ctx_gguf); ggml_free(ctx); throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); } - struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); + struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); // validate tensor shape if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + 
gguf_free(ctx_gguf); + ggml_free(ctx); throw std::runtime_error("tensor '" + name + "' has incorrect shape"); } if (w.a->ne[1] != w.b->ne[0]) { + gguf_free(ctx_gguf); + ggml_free(ctx); throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); } // save tensor to adapter @@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ggml_free(ctx); throw std::runtime_error("failed to allocate buffer for lora adapter\n"); } - ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); adapter.ctxs.push_back(ctx_dev); adapter.bufs.push_back(buf); } @@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); size_t size = ggml_nbytes(orig); - if (read_buf.size() < size) { - read_buf.resize(size); - } + read_buf.resize(size); gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size); ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { From 1faf7e5be6b339d0ac2f3b6615200627f18aa8dc Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 10 Jul 2024 19:51:34 +0200 Subject: [PATCH 22/33] do not disable mmap with lora Co-authored-by: slaren --- common/common.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9d9980cf18169..ddb1e79ae6ae3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -681,7 +681,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa if (arg == "--lora") { CHECK_ARG params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; return true; } if (arg == "--lora-scaled") { @@ -689,7 +688,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa const char* lora_adapter = argv[i]; CHECK_ARG params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; return true; } if (arg == "--lora-base") { From 916e95928b0757fb9e5e601ee5f325af29e5253e Mon Sep 17 00:00:00 2001 From: ngxson Date: Thu, 11 Jul 2024 00:30:07 +0200 Subject: [PATCH 23/33] llm_build_lora_mm_id --- src/llama.cpp | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7ed80fcafcc27..30ecbb801069d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7882,7 +7882,6 @@ static struct ggml_tensor * llm_build_lora_mm( if (lora == nullptr) { continue; } - // TODO: check if lora_a need transpose struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) @@ -7893,6 +7892,31 @@ static struct ggml_tensor * llm_build_lora_mm( return res; } +// do mat_mul_id, while optionally apply lora +static struct ggml_tensor * llm_build_lora_mm_id( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_weight * lora = 
it.first->get_weight(w); + float scale = it.second; + if (lora == nullptr) { + continue; + } + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lora->b, + ggml_mul_mat_id(ctx0, lora->a, cur, ids), + ids + ); + ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -8103,10 +8127,10 @@ static struct ggml_tensor * llm_build_moe_ffn( } cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(gate, "ffn_moe_gate", il); switch (type_op) { @@ -8127,7 +8151,7 @@ static struct ggml_tensor * llm_build_moe_ffn( ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] cb(par, "ffn_moe_gate_par", il); - ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); experts = ggml_mul(ctx, experts, weights); From 9d96328bdf81b2f39ed356ee0b78afa87963be10 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 9 Jul 2024 18:26:38 -0400 Subject: [PATCH 24/33] convert_lora : MoE LoRA conversion support * convert_lora : prefer safetensors, similarly to convert_hf --- convert_hf_to_gguf.py | 11 +- convert_lora_to_gguf.py | 262 ++++++++++++++++++++++++++++++++-------- 2 files changed, 216 insertions(+), 57 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 61f8e370c30fd..ebb5ca376133b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -373,9 +373,6 @@ def from_model_architecture(cls, arch: str) -> type[Model]: except KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - def support_lora(self) -> bool: - return False - # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] @@ -1415,9 +1412,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias", "q_proj.lora_B.weight")): + if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias", "k_proj.lora_B.weight")): + if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately @@ -1465,10 +1462,6 @@ def write_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") - def support_lora(self) -> bool: - # TODO: support lora conversion for MOE - return "num_local_experts" not in self.hparams - @Model.register("BitnetForCausalLM") class BitnetModel(Model): diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 
c7393ac3aceb2..2d01fdc466f9c 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -3,13 +3,14 @@ from __future__ import annotations +from dataclasses import dataclass import logging import argparse import os import sys -import types from pathlib import Path -from typing import TYPE_CHECKING, Iterator +from types import EllipsisType +from typing import TYPE_CHECKING, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast import torch @@ -26,6 +27,169 @@ logger = logging.getLogger("lora-to-gguf") +@dataclass +class PartialLoraTensor: + A: Tensor | None = None + B: Tensor | None = None + + +# magic to support tensor shape modifications and splitting +class LoraTorchTensor: + _lora_A: Tensor + _lora_B: Tensor + _rank: int + + def __init__(self, A: Tensor, B: Tensor): + assert len(A.shape) == len(B.shape) + if A.dtype != B.dtype: + A = A.to(torch.float32) + B = B.to(torch.float32) + self._lora_A = A + self._lora_B = B + assert self._lora_A.shape[-2] == self._lora_B.shape[-1] + self._rank = self._lora_B.shape[-1] + + def __getitem__( + self, + indices: ( + SupportsIndex + | slice + | tuple[SupportsIndex | slice | EllipsisType | Tensor, ...] + ), + ) -> LoraTorchTensor: + shape = self.shape + if isinstance(indices, (SupportsIndex, slice)): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + raise NotImplementedError + elif isinstance(indices, tuple): + assert len(indices) > 0 + if isinstance(indices[-1], EllipsisType): + return self[indices[:-1]] + # expand ellipsis + indices = tuple( + u + for v in ( + ( + (slice(None, None) for _ in range(len(indices) - 1)) + if isinstance(i, EllipsisType) + else (i,) + ) + for i in indices + ) + for u in v + ) + + if len(indices) < len(shape): + indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape)))) + + # TODO: make sure this is correct + # lora_A has a shape which looks like (..., 1, 1, rank, self.shape[-1]) + indices_A = ( + *( + 0 if isinstance(i, SupportsIndex) else slice(None, None) + for i in indices[:-2] + ), + slice(None, None), + indices[-1], + ) + indices_B = indices[:-1] + return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) + else: + raise NotImplementedError + + @property + def dtype(self) -> torch.dtype: + assert self._lora_A.dtype == self._lora_B.dtype + return self._lora_A.dtype + + @property + def shape(self) -> tuple[int, ...]: + return (*self._lora_B.shape[:-1], self._lora_A.shape[-1]) + + def size(self, dim=None): + assert dim is None + return self.shape + + def reshape(self, *shape: int | tuple[int]) -> LoraTorchTensor: + if isinstance(shape[0], tuple): + new_shape: tuple[int] = shape[0] + else: + new_shape = cast(tuple[int], shape) + orig_shape = self.shape + if new_shape[-1] != orig_shape[-1]: + raise NotImplementedError + return LoraTorchTensor( + self._lora_A.reshape((*(1 for _ in new_shape[:-2]), *self._lora_A.shape[-2:])), + self._lora_B.reshape((*new_shape[:-1], self._rank)), + ) + + def reshape_as(self, other: Tensor) -> LoraTorchTensor: + return self.reshape(*other.shape) + + def view(self, *size: int) -> LoraTorchTensor: + return self.reshape(*size) + + def permute(self, *dims: int) -> LoraTorchTensor: + shape = self.shape + dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) + if dims[-1] == -2 and dims[-2] == -1: + return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) + else: + assert dims[-1] == -1 + assert all(dim == 1 for dim in self._lora_A.shape[:-2]) + return 
LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) + + def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: + shape = self.shape + dims = [i for i in range(len(shape))] + dims[dim0], dims[dim1] = dims[dim1], dims[dim0] + return self.permute(*dims) + + def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor: + return self.transpose(axis0, axis1) + + def to(self, *args, **kwargs): + return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs)) + + @classmethod + def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.permute: + return type(args[0]).permute(*args, **kwargs) + elif func is torch.reshape: + return type(args[0]).reshape(*args, **kwargs) + elif func is torch.stack: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + return LoraTorchTensor( + torch.stack([a._lora_A for a in args[0]], dim), + torch.stack([b._lora_B for b in args[0]], dim), + ) + elif func is torch.cat: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + if len(args[0][0].shape) > 2: + return LoraTorchTensor( + torch.cat([a._lora_A for a in args[0]], dim), + torch.cat([b._lora_B for b in args[0]], dim), + ) + else: + return LoraTorchTensor( + args[0][0]._lora_A, # TODO: is this correct? (can't cat over the rank) + torch.cat([b._lora_B for b in args[0]], dim), + ) + else: + raise NotImplementedError + + def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") @@ -79,7 +243,7 @@ def parse_args() -> argparse.Namespace: dir_base_model = args.base dir_lora = args.lora_path input_json = os.path.join(dir_lora, "adapter_config.json") - input_model = os.path.join(dir_lora, "adapter_model.bin") + input_model = os.path.join(dir_lora, "adapter_model.safetensors") if args.outfile is not None: fname_out = args.outfile else: @@ -87,12 +251,13 @@ def parse_args() -> argparse.Namespace: fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' if os.path.exists(input_model): - lora_model = torch.load(input_model, map_location="cpu") - else: - input_model = os.path.join(dir_lora, "adapter_model.safetensors") # lazy import load_file only if lora is in safetensors format. 
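LoraTorchTensor never materializes the full delta; it carries the two factors and re-expresses reshape/permute/cat on them. The transpose case above relies on the identity (B A)^T = A^T B^T, i.e. transposing the product is the same as swapping and transposing the factors. A quick PyTorch check of that identity (sizes are arbitrary):

    import torch

    A = torch.randn(8, 32)      # lora_A: (rank, n_in)
    B = torch.randn(64, 8)      # lora_B: (n_out, rank)

    delta = B @ A               # implicit full tensor, shape (n_out, n_in)
    # transposing the product equals the product of the swapped, transposed factors,
    # so a 2-D permute can be expressed on the factors without ever forming delta
    assert torch.allclose(delta.T, A.T @ B.T)

This is what lets the per-architecture modify_tensors code, written for plain tensors, run unchanged on LoRA pairs.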
from safetensors.torch import load_file + lora_model = load_file(input_model, device="cpu") + else: + input_model = os.path.join(dir_lora, "adapter_model.bin") + lora_model = torch.load(input_model, map_location="cpu", weights_only=True) # load base model logger.info(f"Loading base model: {dir_base_model.name}") @@ -104,53 +269,54 @@ def parse_args() -> argparse.Namespace: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) - logger.info("Set model parameters") - model_instance.set_gguf_parameters() + class LoraModel(model_class): + model_arch = model_class.model_arch - # adapter_config = json.load(input_json) - model_instance.gguf_writer.add_string("training.type", "finetune_lora") - if not model_instance.support_lora(): - logger.error("LoRA conversion is not yet supported for this model") - sys.exit(1) - - # map original name to gguf name - map_name: dict[str, str] = {} - for tensor_name, tensor in lora_model.items(): - base_name = get_base_tensor_name(tensor_name) - is_lora_a = ".lora_A.weight" in tensor_name - is_lora_b = ".lora_B.weight" in tensor_name - if not is_lora_a and not is_lora_b: - logger.error(f"Unexpected name '{tensor_name}': Not a lora_A or lora_B tensor") - sys.exit(1) - dest_name = model_instance.map_tensor_name(base_name) - dest_name = f"{dest_name}.lora_a" if is_lora_a else f"{dest_name}.lora_b" - map_name[tensor_name] = dest_name + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_map: dict[str, PartialLoraTensor] = {} - # overwrite method - def map_tensor_name(self, name: str) -> str: - return map_name[name] + for name, tensor in lora_model.items(): + base_name = get_base_tensor_name(name) + is_lora_a = ".lora_A.weight" in name + is_lora_b = ".lora_B.weight" in name + if not is_lora_a and not is_lora_b: + if ".base_layer.weight" in name: + continue + logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") + sys.exit(1) - # overwrite method - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, tensor in lora_model.items(): - yield (name, tensor) + if base_name in tensor_map: + if is_lora_a: + tensor_map[base_name].A = tensor + else: + tensor_map[base_name].B = tensor + else: + if is_lora_a: + tensor_map[base_name] = PartialLoraTensor(A=tensor) + else: + tensor_map[base_name] = PartialLoraTensor(B=tensor) - # overwrite method - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - return ftype != gguf.LlamaFileType.ALL_F32 + for name, tensor in tensor_map.items(): + assert tensor.A is not None + assert tensor.B is not None + yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) - model_instance._map_tensor_name = model_instance.map_tensor_name # type: ignore - model_instance.map_tensor_name = types.MethodType(map_tensor_name, model_instance) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + dest = super().modify_tensors(data_torch, name, bid) + for dest_name, dest_data in dest: + assert isinstance(dest_data, LoraTorchTensor) + # logger.info(f"{orig_name} --> {dest_name}") + yield (dest_name + ".lora_a", dest_data._lora_A) + yield (dest_name + ".lora_b", dest_data._lora_B) - model_instance._get_tensors = model_instance.get_tensors # type: ignore - model_instance.get_tensors = types.MethodType(get_tensors, model_instance) + model_instance 
= LoraModel(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) + logger.info("Set model parameters") + model_instance.set_gguf_parameters() - model_instance._extra_f16_tensors = model_instance.extra_f16_tensors # type: ignore - model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) + # adapter_config = json.load(input_json) + model_instance.gguf_writer.add_string("training.type", "finetune_lora") - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - logger.info("Exporting model...") - model_instance.write() - logger.info(f"Model successfully exported to {model_instance.fname_out}") + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + logger.info("Exporting model...") + model_instance.write() + logger.info(f"Model successfully exported to {model_instance.fname_out}") From 8956543c091c6851089ed7467fa44ac2b2b0ee37 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 15 Jul 2024 02:35:06 -0400 Subject: [PATCH 25/33] convert_hf : simplify modify_tensors for InternLM2 * convert_lora : lazy conversion * llama : load and use alpha from LoRA adapters --- convert_hf_to_gguf.py | 33 ++++------ convert_lora_to_gguf.py | 134 ++++++++++++++++++++++++++++------------ gguf-py/gguf/quants.py | 2 +- src/llama.cpp | 22 +++++-- 4 files changed, 124 insertions(+), 67 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ebb5ca376133b..70ea963f2e879 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2222,13 +2222,6 @@ def set_vocab(self): special_vocab.add_to_gguf(self.gguf_writer) - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - def set_gguf_parameters(self): self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) @@ -2248,26 +2241,22 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] + n_embd = self.hparams["hidden_size"] q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads + head_dim = n_embd // num_heads num_groups = num_heads // q_per_kv - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] + if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: qkv = data_torch - # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + + qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) + q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] + # The model weights of q and k equire additional reshape. 
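The LoraModel subclass pairs each PEFT lora_A/lora_B tensor under its base name, lets the regular modify_tensors mapping run on the combined LoraTorchTensor, and finally splits the result back into two GGUF tensors. Roughly, for a hypothetical PEFT name (the mapped name in the comment is only a typical example; the real value comes from Model.map_tensor_name):

    peft_name = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
    base_name = (peft_name
                 .replace("base_model.model.", "")
                 .replace(".lora_A.weight", ".weight")
                 .replace(".lora_B.weight", ".weight"))
    assert base_name == "model.layers.0.self_attn.q_proj.weight"
    # map_tensor_name would typically turn this into something like "blk.0.attn_q.weight",
    # and the adapter GGUF then stores two tensors for it:
    #   blk.0.attn_q.weight.lora_a   (the A factor)
    #   blk.0.attn_q.weight.lora_b   (the B factor)

Because modify_tensors sees the combined tensor, architecture-specific transforms such as the q/k permutes apply consistently to both factors, which is why the earlier lora_B special-casing in convert_hf_to_gguf.py could be dropped.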
- # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) - # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) - # v = rearrange(v, " o g n i -> o (g n i)").T - v = v.reshape((v.shape[0], -1)).T + q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + v = v.reshape((-1, v.shape[-1])) + return [ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 2d01fdc466f9c..71d3e57f55720 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -8,9 +8,10 @@ import argparse import os import sys +import json +from math import prod from pathlib import Path -from types import EllipsisType -from typing import TYPE_CHECKING, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast +from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast import torch @@ -22,7 +23,7 @@ import gguf # reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import Model +from convert_hf_to_gguf import LazyTorchTensor, Model logger = logging.getLogger("lora-to-gguf") @@ -35,37 +36,45 @@ class PartialLoraTensor: # magic to support tensor shape modifications and splitting class LoraTorchTensor: - _lora_A: Tensor - _lora_B: Tensor + _lora_A: Tensor # (n_rank, row_size) + _lora_B: Tensor # (col_size, n_rank) _rank: int def __init__(self, A: Tensor, B: Tensor): assert len(A.shape) == len(B.shape) + assert A.shape[-2] == B.shape[-1] if A.dtype != B.dtype: A = A.to(torch.float32) B = B.to(torch.float32) self._lora_A = A self._lora_B = B - assert self._lora_A.shape[-2] == self._lora_B.shape[-1] - self._rank = self._lora_B.shape[-1] + self._rank = B.shape[-1] + + def get_lora_A_B(self) -> tuple[Tensor, Tensor]: + return (self._lora_A, self._lora_B) def __getitem__( self, indices: ( SupportsIndex | slice - | tuple[SupportsIndex | slice | EllipsisType | Tensor, ...] + | tuple[SupportsIndex | slice | Tensor, ...] 
# TODO: add ellipsis in the type signature ), ) -> LoraTorchTensor: shape = self.shape - if isinstance(indices, (SupportsIndex, slice)): + if isinstance(indices, SupportsIndex): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) else: - raise NotImplementedError + raise NotImplementedError # can't return a vector + elif isinstance(indices, slice): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + return LoraTorchTensor(self._lora_A, self._lora_B[indices]) elif isinstance(indices, tuple): assert len(indices) > 0 - if isinstance(indices[-1], EllipsisType): + if indices[-1] is Ellipsis: return self[indices[:-1]] # expand ellipsis indices = tuple( @@ -73,7 +82,7 @@ def __getitem__( for v in ( ( (slice(None, None) for _ in range(len(indices) - 1)) - if isinstance(i, EllipsisType) + if i is Ellipsis else (i,) ) for i in indices @@ -85,11 +94,14 @@ def __getitem__( indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape)))) # TODO: make sure this is correct - # lora_A has a shape which looks like (..., 1, 1, rank, self.shape[-1]) indices_A = ( *( - 0 if isinstance(i, SupportsIndex) else slice(None, None) - for i in indices[:-2] + ( + j.__index__() % self._lora_A.shape[i] + if isinstance(j, SupportsIndex) + else slice(None, None) + ) + for i, j in enumerate(indices[:-2]) ), slice(None, None), indices[-1], @@ -97,7 +109,7 @@ def __getitem__( indices_B = indices[:-1] return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) else: - raise NotImplementedError + raise NotImplementedError # unknown indice type @property def dtype(self) -> torch.dtype: @@ -106,23 +118,37 @@ def dtype(self) -> torch.dtype: @property def shape(self) -> tuple[int, ...]: + assert len(self._lora_A.shape) == len(self._lora_B.shape) return (*self._lora_B.shape[:-1], self._lora_A.shape[-1]) def size(self, dim=None): assert dim is None return self.shape - def reshape(self, *shape: int | tuple[int]) -> LoraTorchTensor: + def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: if isinstance(shape[0], tuple): - new_shape: tuple[int] = shape[0] + new_shape: tuple[int, ...] 
= shape[0] else: - new_shape = cast(tuple[int], shape) + new_shape = cast(tuple[int, ...], shape) orig_shape = self.shape + if len(new_shape) < 2: + raise NotImplementedError # can't become a vector + + # expand -1 in the shape + if any(dim == -1 for dim in new_shape): + n_elems = prod(orig_shape) + n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) + assert n_elems % n_new_elems == 0 + new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),) + if new_shape[-1] != orig_shape[-1]: - raise NotImplementedError + raise NotImplementedError # can't reshape the row size trivially + + shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) + shape_B = (*new_shape[:-1], self._rank) return LoraTorchTensor( - self._lora_A.reshape((*(1 for _ in new_shape[:-2]), *self._lora_A.shape[-2:])), - self._lora_B.reshape((*new_shape[:-1], self._rank)), + self._lora_A.reshape(shape_A), + self._lora_B.reshape(shape_B), ) def reshape_as(self, other: Tensor) -> LoraTorchTensor: @@ -134,12 +160,15 @@ def view(self, *size: int) -> LoraTorchTensor: def permute(self, *dims: int) -> LoraTorchTensor: shape = self.shape dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) - if dims[-1] == -2 and dims[-2] == -1: - return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) - else: - assert dims[-1] == -1 + if dims[-1] == -1: + # TODO: support higher dimensional A shapes bigger than 1 assert all(dim == 1 for dim in self._lora_A.shape[:-2]) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) + if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: + return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) + else: + # TODO: compose the above two + raise NotImplementedError def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: shape = self.shape @@ -181,11 +210,13 @@ def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): torch.cat([a._lora_A for a in args[0]], dim), torch.cat([b._lora_B for b in args[0]], dim), ) - else: + elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]): return LoraTorchTensor( - args[0][0]._lora_A, # TODO: is this correct? (can't cat over the rank) + args[0][0]._lora_A, torch.cat([b._lora_B for b in args[0]], dim), ) + else: + raise NotImplementedError else: raise NotImplementedError @@ -205,13 +236,17 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", help="model is executed on big endian machine", ) + parser.add_argument( + "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) parser.add_argument( "--verbose", action="store_true", help="increase output verbosity", @@ -237,13 +272,16 @@ def parse_args() -> argparse.Namespace: "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, } + ftype = ftype_map[args.outtype] - dir_base_model = args.base - dir_lora = args.lora_path - input_json = os.path.join(dir_lora, "adapter_config.json") - input_model = os.path.join(dir_lora, "adapter_model.safetensors") + dir_base_model: Path = args.base + dir_lora: Path = args.lora_path + lora_config = dir_lora / "adapter_config.json" + input_model = dir_lora / "adapter_model.safetensors" + if args.outfile is not None: fname_out = args.outfile else: @@ -276,6 +314,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} for name, tensor in lora_model.items(): + if self.lazy: + tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) is_lora_a = ".lora_A.weight" in name is_lora_b = ".lora_B.weight" in name @@ -305,16 +345,30 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter dest = super().modify_tensors(data_torch, name, bid) for dest_name, dest_data in dest: assert isinstance(dest_data, LoraTorchTensor) - # logger.info(f"{orig_name} --> {dest_name}") - yield (dest_name + ".lora_a", dest_data._lora_A) - yield (dest_name + ".lora_b", dest_data._lora_B) - - model_instance = LoraModel(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) + lora_a, lora_b = dest_data.get_lora_A_B() + + yield (dest_name + ".lora_a", lora_a) + yield (dest_name + ".lora_b", lora_b) + + model_instance = LoraModel( + dir_base_model, + ftype, + fname_out, + is_big_endian=args.bigendian, + use_temp_file=False, + eager=args.no_lazy, + model_name=None, + ) logger.info("Set model parameters") model_instance.set_gguf_parameters() - # adapter_config = json.load(input_json) + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + alpha = lparams["lora_alpha"] + model_instance.gguf_writer.add_string("training.type", "finetune_lora") + model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index b22eec1661ce7..16e0a9aaa8a8b 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -43,7 +43,7 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np. 
osize *= dim out = np.empty(shape=osize, dtype=otype) # compute over groups of 16 rows (arbitrary, but seems good for performance) - n_groups = rows.shape[0] // 16 + n_groups = (rows.shape[0] // 16) or 1 np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out) return out.reshape(oshape) diff --git a/src/llama.cpp b/src/llama.cpp index 30ecbb801069d..3906b9ea16f7b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -379,6 +379,7 @@ enum llm_kv { LLM_KV_TOKENIZER_EOT_ID, LLM_KV_TRAINING_TYPE, + LLM_KV_TRAINING_LORA_ALPHA, }; static const std::map LLM_KV_NAMES = { @@ -473,7 +474,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TRAINING_TYPE, "training.type" }, + { LLM_KV_TRAINING_TYPE, "training.type" }, + { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, }; struct LLM_KV { @@ -2848,6 +2850,8 @@ struct llama_lora_adapter { std::vector ctxs; std::vector bufs; + float alpha; + llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { base_model->lora_adapters.insert(this); } @@ -7878,10 +7882,12 @@ static struct ggml_tensor * llm_build_lora_mm( struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); for (auto & it : lctx.lora_adapters) { struct llama_lora_weight * lora = it.first->get_weight(w); - float scale = it.second; if (lora == nullptr) { continue; } + const float alpha = it.first->alpha; + const float rank = (float) lora->b->ne[0]; + const float scale = alpha ? it.second * alpha / rank : it.second; struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) @@ -7902,10 +7908,12 @@ static struct ggml_tensor * llm_build_lora_mm_id( struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); for (auto & it : lctx.lora_adapters) { struct llama_lora_weight * lora = it.first->get_weight(w); - float scale = it.second; if (lora == nullptr) { continue; } + const float alpha = it.first->alpha; + const float rank = (float) lora->b->ne[0]; + const float scale = alpha ? it.second * alpha / rank : it.second; struct ggml_tensor * ab_cur = ggml_mul_mat_id( ctx0, lora->b, ggml_mul_mat_id(ctx0, lora->a, cur, ids), @@ -18587,10 +18595,14 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c // check metadata { - auto get_kv_str = [&](std::string key) -> std::string { + auto get_kv_str = [&](const std::string & key) -> std::string { int id = gguf_find_key(ctx_gguf, key.c_str()); return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); }; + auto get_kv_f32 = [&](const std::string & key) -> float { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 
0.0f : gguf_get_val_f32(ctx_gguf, id); + }; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto lora_arch = llm_arch_from_string(lora_arch_name); @@ -18604,6 +18616,8 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c gguf_free(ctx_gguf); throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); } + + adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); From 87301bdd59554604ce0103fe39580a1608cf97cd Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 15 Jul 2024 03:23:19 -0400 Subject: [PATCH 26/33] llama : use llm_build_lora_mm in most model graphs --- src/llama.cpp | 238 +++++++++++++++++++++++++------------------------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3906b9ea16f7b..5c7edf6b7f496 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8945,13 +8945,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -9024,7 +9024,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9060,13 +9060,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -9127,7 +9127,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9176,7 +9176,7 @@ struct llm_build_context { cur = attn_norm; } - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -9247,7 +9247,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9292,21 +9292,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct 
ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -9398,7 +9398,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -9449,7 +9449,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -9529,7 +9529,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -9571,7 +9571,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -9634,7 +9634,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9666,13 +9666,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -9728,7 +9728,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9780,7 +9780,7 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { @@ -9790,7 +9790,7 
@@ struct llm_build_context { LLM_NORM, cb, il); } - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { @@ -9799,14 +9799,14 @@ struct llm_build_context { model.layers[il].attn_k_norm_b, LLM_NORM, cb, il); } - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -9855,7 +9855,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -9960,7 +9960,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10023,7 +10023,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10070,7 +10070,7 @@ struct llm_build_context { { cur = attn_norm; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -10163,7 +10163,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10203,21 +10203,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -10313,7 +10313,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); 
@@ -10348,7 +10348,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10425,7 +10425,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10463,17 +10463,17 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -10537,7 +10537,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10578,17 +10578,17 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -10643,7 +10643,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid @@ -10683,7 +10683,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10725,7 +10725,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10735,9 +10735,9 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, 
ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -10803,7 +10803,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -10849,7 +10849,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); @@ -10857,9 +10857,9 @@ struct llm_build_context { Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -10931,7 +10931,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10971,13 +10971,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11036,7 +11036,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + 
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11078,7 +11078,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -11141,7 +11141,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11177,7 +11177,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -11252,7 +11252,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11290,21 +11290,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11370,7 +11370,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11408,21 +11408,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11488,7 +11488,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, 
cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11539,21 +11539,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11635,7 +11635,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11672,13 +11672,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11743,7 +11743,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11785,13 +11785,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11866,7 +11866,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -11911,21 +11911,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, 
model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11992,7 +11992,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12044,7 +12044,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens} - struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur); + struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, cur); // split the above in two // => {d_inner, n_tokens} struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); @@ -12083,14 +12083,14 @@ struct llm_build_context { // ssm { // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} - struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); + struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_x, x); // split struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0); struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank); struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens} - dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); + dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); // Custom operator to optimize the parallel associative scan @@ -12121,7 +12121,7 @@ struct llm_build_context { y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens} - cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, y); } // residual @@ -12140,7 +12140,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12179,21 +12179,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, 
model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -12283,7 +12283,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -12336,21 +12336,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -12419,7 +12419,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12459,7 +12459,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -12544,7 +12544,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12579,7 +12579,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -12686,7 +12686,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12727,13 +12727,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -12818,7 
+12818,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -13087,7 +13087,7 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { @@ -13096,7 +13096,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { @@ -13105,7 +13105,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { @@ -13136,7 +13136,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "attn_sub_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); if (model.layers[il].bo) { cur = ggml_add(ctx0, cur, model.layers[il].bo); @@ -13173,7 +13173,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_sub_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); cb(cur, "ffn_down", il); @@ -13192,7 +13192,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -13540,7 +13540,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -13599,7 +13599,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -13641,7 +13641,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -13714,7 +13714,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); From 42415a4874e0f963e4aca6796ea5dfb97cd17464 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 11:41:18 +0200 Subject: [PATCH 27/33] auto scale --- common/common.cpp | 5 ++++- convert_lora_to_gguf.py | 2 ++ include/llama.h | 35 
+++++++++++++++++++++-------------- src/llama.cpp | 10 +++++++++- 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index dac152c4fc4b6..4cc71179c8dca 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,7 +684,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapter.emplace_back(argv[i], 0.0f); return true; } if (arg == "--lora-scaled") { @@ -2089,6 +2089,9 @@ std::tuple llama_init_from_gpt_par llama_free_model(model); return std::make_tuple(nullptr, nullptr); } + if (lora_scale == 0.0f) { + lora_scale = llama_lora_adapter_get_default_scale(adapter); + } llama_lora_adapter_set(lctx, adapter, lora_scale); } diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 71d3e57f55720..be0b6f272084d 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -366,9 +366,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter lparams: dict[str, Any] = json.load(f) alpha = lparams["lora_alpha"] + rank = lparams["r"] model_instance.gguf_writer.add_string("training.type", "finetune_lora") model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) + model_instance.gguf_writer.add_float32("training.lora.scale", float(alpha) / float(rank)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") diff --git a/include/llama.h b/include/llama.h index c57d21f0c70b9..01ea884669cb9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -513,12 +513,33 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + + // + // LoRA + // + // Load a LoRA adapter from file // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_model * model, const char * path_lora); + // Get default scale of an adapter + LLAMA_API float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter); + // Add a loaded LoRA adapter to given context // This will not modify model's weight LLAMA_API int32_t llama_lora_adapter_set( @@ -536,20 +557,6 @@ extern "C" { // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); - // Apply a loaded control vector to a llama_context, or if data is NULL, clear - // the currently loaded vector. - // n_embd should be the size of a single layer's control, and data should point - // to an n_embd x n_layers buffer starting from layer 1. - // il_start and il_end are the layer range the vector should apply to (both inclusive) - // See llama_control_vector_load in common to load a control vector. 
- LLAMA_API int32_t llama_control_vector_apply( - struct llama_context * lctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end); - // // KV cache // diff --git a/src/llama.cpp b/src/llama.cpp index d5a7bb62bda2b..4c77b101437e0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -380,6 +380,7 @@ enum llm_kv { LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_LORA_ALPHA, + LLM_KV_TRAINING_LORA_SCALE, }; static const std::map LLM_KV_NAMES = { @@ -476,6 +477,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TRAINING_TYPE, "training.type" }, { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, + { LLM_KV_TRAINING_LORA_SCALE, "training.lora.scale" }, }; struct LLM_KV { @@ -2851,6 +2853,7 @@ struct llama_lora_adapter { std::vector bufs; float alpha; + float scale; // default scale llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { base_model->lora_adapters.insert(this); @@ -18578,7 +18581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -18615,6 +18618,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); + adapter.scale = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_SCALE)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -18749,6 +18753,10 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ggml_free(ctx); } +float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter) { + return adapter->scale; +} + int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, From 5b181182488796da836651fa8e053ca7fcb34192 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 11:48:51 +0200 Subject: [PATCH 28/33] Revert "auto scale" This reverts commit 42415a4874e0f963e4aca6796ea5dfb97cd17464. 
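For reference (illustration only; none of the code below appears in the patches themselves): both the alpha/rank computation added to llm_build_lora_mm earlier in the series and the "auto scale" change that this commit reverts encode the usual LoRA convention, where the low-rank delta B*A is weighted by alpha divided by the adapter rank on top of whatever scale the caller passes to llama_lora_adapter_set. A minimal C++ sketch of that effective scale, mirroring the hunk in llm_build_lora_mm (the helper name is hypothetical):

#include <cstdint>

// Hypothetical helper, not part of any patch in this series.
// user_scale : the per-adapter scale passed to llama_lora_adapter_set()
// alpha      : adapter alpha read from the GGUF metadata ("training.lora.alpha" here,
//              renamed to "adapter.lora.alpha" later in the series)
// rank       : LoRA rank, i.e. the inner dimension shared by the A/B tensors
//              (taken from lora->b->ne[0] in the graph-building code above)
static float lora_effective_scale(float user_scale, float alpha, int64_t rank) {
    // if no alpha was stored (0.0f), fall back to the raw user scale
    return alpha != 0.0f ? user_scale * alpha / (float) rank : user_scale;
}

With that scale, each adapted matmul becomes W*x + lora_effective_scale(...) * B*(A*x), which is what llm_build_lora_mm builds into the graph.
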
--- common/common.cpp | 5 +---- convert_lora_to_gguf.py | 2 -- include/llama.h | 35 ++++++++++++++--------------------- src/llama.cpp | 10 +--------- 4 files changed, 16 insertions(+), 36 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4cc71179c8dca..dac152c4fc4b6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,7 +684,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 0.0f); + params.lora_adapter.emplace_back(argv[i], 1.0f); return true; } if (arg == "--lora-scaled") { @@ -2089,9 +2089,6 @@ std::tuple llama_init_from_gpt_par llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - if (lora_scale == 0.0f) { - lora_scale = llama_lora_adapter_get_default_scale(adapter); - } llama_lora_adapter_set(lctx, adapter, lora_scale); } diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index be0b6f272084d..71d3e57f55720 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -366,11 +366,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter lparams: dict[str, Any] = json.load(f) alpha = lparams["lora_alpha"] - rank = lparams["r"] model_instance.gguf_writer.add_string("training.type", "finetune_lora") model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) - model_instance.gguf_writer.add_float32("training.lora.scale", float(alpha) / float(rank)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") diff --git a/include/llama.h b/include/llama.h index 01ea884669cb9..c57d21f0c70b9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -513,33 +513,12 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a loaded control vector to a llama_context, or if data is NULL, clear - // the currently loaded vector. - // n_embd should be the size of a single layer's control, and data should point - // to an n_embd x n_layers buffer starting from layer 1. - // il_start and il_end are the layer range the vector should apply to (both inclusive) - // See llama_control_vector_load in common to load a control vector. - LLAMA_API int32_t llama_control_vector_apply( - struct llama_context * lctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end); - - // - // LoRA - // - // Load a LoRA adapter from file // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_model * model, const char * path_lora); - // Get default scale of an adapter - LLAMA_API float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter); - // Add a loaded LoRA adapter to given context // This will not modify model's weight LLAMA_API int32_t llama_lora_adapter_set( @@ -557,6 +536,20 @@ extern "C" { // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. 
+ // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + // // KV cache // diff --git a/src/llama.cpp b/src/llama.cpp index 4c77b101437e0..d5a7bb62bda2b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -380,7 +380,6 @@ enum llm_kv { LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_LORA_ALPHA, - LLM_KV_TRAINING_LORA_SCALE, }; static const std::map LLM_KV_NAMES = { @@ -477,7 +476,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TRAINING_TYPE, "training.type" }, { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, - { LLM_KV_TRAINING_LORA_SCALE, "training.lora.scale" }, }; struct LLM_KV { @@ -2853,7 +2851,6 @@ struct llama_lora_adapter { std::vector bufs; float alpha; - float scale; // default scale llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { base_model->lora_adapters.insert(this); @@ -18581,7 +18578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -18618,7 +18615,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); - adapter.scale = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_SCALE)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -18753,10 +18749,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ggml_free(ctx); } -float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter) { - return adapter->scale; -} - int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, From f68d092459059df92b0ba68b0b64282c1d56c56d Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 12:12:22 +0200 Subject: [PATCH 29/33] remove redundant params --- src/llama.cpp | 104 +++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d5a7bb62bda2b..bc2d53c967add 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8079,8 +8079,8 @@ static struct ggml_tensor * llm_build_ffn( } static struct ggml_tensor * llm_build_moe_ffn( - struct llama_context & lctx, struct ggml_context * ctx, + struct llama_context & lctx, struct ggml_tensor * cur, struct ggml_tensor * gate_inp, struct ggml_tensor * up_exps, @@ -8180,11 +8180,8 @@ static struct ggml_tensor * llm_build_moe_ffn( } static struct ggml_tensor * llm_build_kqv( - struct llama_context & lctx, struct ggml_context * ctx, - const llama_model & model, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -8196,6 +8193,10 @@ static struct ggml_tensor * llm_build_kqv( float kq_scale, const llm_build_cb & cb, int il) { + const llama_model & model = lctx.model; + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; + 
const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); @@ -8309,11 +8310,8 @@ static struct ggml_tensor * llm_build_kqv( } static struct ggml_tensor * llm_build_kv( - struct llama_context & lctx, struct ggml_context * ctx, - const llama_model & model, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -8328,6 +8326,8 @@ static struct ggml_tensor * llm_build_kv( float kq_scale, const llm_build_cb & cb, int il) { + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced @@ -8339,7 +8339,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; - cur = llm_build_kqv(lctx, ctx, model, hparams, cparams, kv, graph, wo, wo_b, + cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -8836,7 +8836,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8873,7 +8873,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -8971,7 +8971,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9076,7 +9076,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9197,7 +9197,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9321,7 +9321,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -9353,7 +9353,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9471,7 +9471,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, 
NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9494,7 +9494,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9581,7 +9581,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9675,7 +9675,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9970,7 +9970,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10102,13 +10102,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10253,7 +10253,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10372,7 +10372,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10486,7 +10486,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10601,7 +10601,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); 
} @@ -10624,7 +10624,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(lctx, ctx0, cur, + llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -10758,7 +10758,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10878,7 +10878,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10986,7 +10986,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11088,7 +11088,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11199,7 +11199,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11319,7 +11319,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11437,7 +11437,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11568,7 +11568,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11690,7 +11690,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11808,7 +11808,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, 
kv_head, n_kv, 1.0f, cb, il); } @@ -11945,7 +11945,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12238,7 +12238,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12370,7 +12370,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12497,7 +12497,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12606,7 +12606,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12749,7 +12749,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12788,7 +12788,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12971,7 +12971,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -13008,7 +13008,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(lctx, ctx0, cur, + llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -13126,7 +13126,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -13555,7 +13555,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, 
model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -13668,7 +13668,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -18578,7 +18578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { From 9175f4b77c4166b964f5eaffbd2da19a91576b71 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Jul 2024 15:02:46 +0200 Subject: [PATCH 30/33] Apply suggestions from code review Co-authored-by: slaren --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d047d1a3117d2..f94bee142fcff 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7886,7 +7886,7 @@ static struct ggml_tensor * llm_build_lora_mm( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) ); - ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + ab_cur = ggml_scale(ctx0, ab_cur, scale); res = ggml_add(ctx0, res, ab_cur); } return res; @@ -7913,7 +7913,7 @@ static struct ggml_tensor * llm_build_lora_mm_id( ggml_mul_mat_id(ctx0, lora->a, cur, ids), ids ); - ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + ab_cur = ggml_scale(ctx0, ab_cur, scale); res = ggml_add(ctx0, res, ab_cur); } return res; From 0ba23bad6f2169c90b94a605a4d72614821ad7cc Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 15:35:19 +0200 Subject: [PATCH 31/33] change kv metadata --- convert_hf_to_gguf.py | 1 + convert_lora_to_gguf.py | 9 ++++----- gguf-py/gguf/constants.py | 8 ++++++++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama.cpp | 31 ++++++++++++++++++++----------- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c5eb7bdbb9bce..a66228d71ed31 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -186,6 +186,7 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return new_name def set_gguf_parameters(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 71d3e57f55720..bfd252d2e7ecd 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -359,17 +359,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter eager=args.no_lazy, model_name=None, ) - logger.info("Set model parameters") - model_instance.set_gguf_parameters() with open(lora_config, "r") as f: lparams: dict[str, Any] = json.load(f) alpha = lparams["lora_alpha"] - model_instance.gguf_writer.add_string("training.type", "finetune_lora") - model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) - + model_instance.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[model_instance.model_arch]) + 
model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) + model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a95a44237e348..390d2d1890e2a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -19,6 +19,7 @@ class Keys: class General: + TYPE = "general.type" ARCHITECTURE = "general.architecture" QUANTIZATION_VERSION = "general.quantization_version" ALIGNMENT = "general.alignment" @@ -120,10 +121,17 @@ class Tokenizer: MIDDLE_ID = "tokenizer.ggml.middle_token_id" EOT_ID = "tokenizer.ggml.eot_token_id" + class Adapter: + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + # # recommended mapping of model tensor names for storage in gguf # +class GGUFType: + MODEL = "model" + ADAPTER = "adapter" class MODEL_ARCH(IntEnum): LLAMA = auto() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index cf95541629032..b0197961d46a8 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -424,6 +424,9 @@ def close(self) -> None: fout.close() self.fout = None + def add_type(self, type_name: str) -> None: + self.add_string(Keys.General.TYPE, type_name) + def add_architecture(self) -> None: self.add_string(Keys.General.ARCHITECTURE, self.arch) diff --git a/src/llama.cpp b/src/llama.cpp index f94bee142fcff..07bb427135d8c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -287,6 +287,7 @@ static const std::map LLM_ARCH_NAMES = { }; enum llm_kv { + LLM_KV_GENERAL_TYPE, LLM_KV_GENERAL_ARCHITECTURE, LLM_KV_GENERAL_QUANTIZATION_VERSION, LLM_KV_GENERAL_ALIGNMENT, @@ -378,11 +379,12 @@ enum llm_kv { LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, - LLM_KV_TRAINING_TYPE, - LLM_KV_TRAINING_LORA_ALPHA, + LLM_KV_ADAPTER_TYPE, + LLM_KV_ADAPTER_LORA_ALPHA, }; static const std::map LLM_KV_NAMES = { + { LLM_KV_GENERAL_TYPE, "general.type" }, { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, @@ -474,8 +476,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TRAINING_TYPE, "training.type" }, - { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, }; struct LLM_KV { @@ -18596,20 +18598,27 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c return id < 0 ? 
0.0f : gguf_get_val_f32(ctx_gguf, id); }; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); - auto lora_arch = llm_arch_from_string(lora_arch_name); - if (lora_arch != model->arch) { + + auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); + if (general_type != "adapter") { + gguf_free(ctx_gguf); + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); + auto general_arch = llm_arch_from_string(general_arch_str); + if (general_arch != model->arch) { gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } - auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); - if (train_type != "finetune_lora") { + auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); + if (adapter_type != "lora") { gguf_free(ctx_gguf); - throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); } - adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); + adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); From b1c40695029f4317a8c1598d1523631b60755b44 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 17:22:38 +0200 Subject: [PATCH 32/33] move add_type to __init__ --- convert_hf_to_gguf.py | 2 +- gguf-py/gguf/constants.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a66228d71ed31..ecf5aa234caca 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -99,6 +99,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) + self.gguf_writer.add_type(gguf.GGUFType.MODEL) @classmethod def __init_subclass__(cls): @@ -186,7 +187,6 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return new_name def set_gguf_parameters(self): - self.gguf_writer.add_type(gguf.GGUFType.MODEL) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 390d2d1890e2a..5eb3df706e6e2 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -129,10 +129,12 @@ class Adapter: # recommended mapping of model tensor names for storage in gguf # + class GGUFType: MODEL = "model" ADAPTER = "adapter" + class MODEL_ARCH(IntEnum): LLAMA = auto() FALCON = auto() From d09382fac7ec9eafa8b94c656ec55feb250e4bee Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 15 Jul 2024 11:39:42 -0400 Subject: [PATCH 33/33] convert_hf : move add_type to main() * convert_lora : use the GGUFWriter from Model instead of overwriting it --- convert_hf_to_gguf.py | 2 +- convert_lora_to_gguf.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 
ecf5aa234caca..a755b0a60bf0a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -99,7 +99,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) - self.gguf_writer.add_type(gguf.GGUFType.MODEL) @classmethod def __init_subclass__(cls): @@ -3575,6 +3574,7 @@ def main() -> None: small_first_shard=args.no_tensor_first_split) logger.info("Set model parameters") + model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) model_instance.set_gguf_parameters() logger.info("Set model tokenizer") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bfd252d2e7ecd..4bb939d45d6bd 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -365,7 +365,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter alpha = lparams["lora_alpha"] - model_instance.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[model_instance.model_arch]) model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
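
Editor's note (not part of the patches): the net effect of the llm_build_lora_mm change reviewed above is that every matmul against a weight that has LoRA factors also accumulates scale * B*(A*x) into the result at graph-build time. Below is a minimal, self-contained sketch of that pattern; lora_weight_sketch, lora_adapter_sketch and build_lora_mm_sketch are illustrative stand-ins for the llama_lora_adapter machinery in the series, the alpha/rank scaling is the conventional LoRA formula rather than a quote of the final code, and only the ggml calls (ggml_mul_mat, ggml_scale, ggml_add) are existing API.

#include "ggml.h"

#include <map>
#include <string>
#include <vector>

// Simplified stand-ins for the adapter structs introduced in this series.
struct lora_weight_sketch {
    ggml_tensor * a; // factor A, shape [n_embd, r]
    ggml_tensor * b; // factor B, shape [r, n_out]
};

struct lora_adapter_sketch {
    std::map<std::string, lora_weight_sketch> ab_map; // keyed by base weight tensor name
    float alpha      = 0.0f;                          // read from adapter.lora.alpha
    float user_scale = 1.0f;                          // per-adapter scale requested by the caller
};

// LoRA-aware replacement for ggml_mul_mat(ctx, w, cur):
//   res = w*cur + sum over adapters of scale * B*(A*cur)
static ggml_tensor * build_lora_mm_sketch(
        ggml_context * ctx,
        const std::vector<lora_adapter_sketch> & adapters,
        const std::string & w_name,
        ggml_tensor * w,
        ggml_tensor * cur) {
    ggml_tensor * res = ggml_mul_mat(ctx, w, cur);

    for (const auto & adapter : adapters) {
        const auto it = adapter.ab_map.find(w_name);
        if (it == adapter.ab_map.end()) {
            continue; // this adapter has no weights for w
        }
        const lora_weight_sketch & lw = it->second;

        // conventional LoRA scaling: user scale * alpha / rank, where rank is the
        // shared dimension of A and B
        const float rank  = (float) lw.b->ne[0];
        const float scale = adapter.user_scale * adapter.alpha / rank;

        // delta = B * (A * cur), scaled and accumulated into the base result;
        // the non-inplace ggml_scale matches the review fix applied above
        ggml_tensor * ab = ggml_mul_mat(ctx, lw.b, ggml_mul_mat(ctx, lw.a, cur));
        ab  = ggml_scale(ctx, ab, scale);
        res = ggml_add(ctx, res, ab);
    }
    return res;
}

Because the delta is added as extra graph nodes rather than merged into the base weights, adapters can live in their own backend buffers and be swapped without reloading the model, which is the point of the refactor.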
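
Editor's note (not part of the patches): the adapter.* metadata introduced in "change kv metadata" can also be inspected from the reader side with the plain gguf C API. The following is a hedged sketch of such a check, not code from the series; the key strings mirror the ones added to gguf-py and llama.cpp above, and gguf_init_from_file, gguf_find_key, gguf_get_val_str and gguf_get_val_f32 are existing API.

#include "ggml.h"

#include <cstdio>
#include <cstring>

// Returns true if the GGUF file at `path` carries the LoRA adapter metadata
// layout written by convert_lora_to_gguf.py after this series.
static bool is_lora_adapter_gguf(const char * path) {
    struct gguf_init_params params = {
        /* .no_alloc = */ true,    // metadata only
        /* .ctx      = */ nullptr, // no ggml context needed for this check
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
    if (ctx_gguf == nullptr) {
        return false;
    }

    bool ok = false;
    const int id_type    = gguf_find_key(ctx_gguf, "general.type");
    const int id_adapter = gguf_find_key(ctx_gguf, "adapter.type");
    const int id_alpha   = gguf_find_key(ctx_gguf, "adapter.lora.alpha");
    if (id_type >= 0 && id_adapter >= 0 && id_alpha >= 0) {
        ok = strcmp(gguf_get_val_str(ctx_gguf, id_type),    "adapter") == 0 &&
             strcmp(gguf_get_val_str(ctx_gguf, id_adapter), "lora")    == 0;
        if (ok) {
            printf("adapter.lora.alpha = %.2f\n", gguf_get_val_f32(ctx_gguf, id_alpha));
        }
    }

    gguf_free(ctx_gguf);
    return ok;
}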