From 67c5e14d069fba61a424f6d782de2d49bf2a8722 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Sat, 6 Jul 2024 02:12:53 +0200
Subject: [PATCH 01/33] lora: load to device buft

---
 common/common.cpp |  10 +-
 include/llama.h   |  13 +-
 src/llama.cpp     | 411 ++++++++++++++++++----------------------
 3 files changed, 166 insertions(+), 268 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c548bcb2857a8..d3eec6aa783b3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2063,14 +2063,8 @@ std::tuple llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
diff --git a/include/llama.h b/include/llama.h
index 865ace9944d02..077d902837c49 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -406,6 +406,9 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
+    // lora adapter
+    struct llama_lora_adapter;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -510,13 +513,9 @@ extern "C" {
     // the layers modified by the adapter. Can be NULL to use the current loaded model.
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-            const char * path_lora,
-            float scale,
-            const char * path_base_model,
-            int32_t n_threads);
+    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+            struct llama_context * ctx,
+            const char * path_lora);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
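
The header change above replaces llama_model_apply_lora_from_file() with an adapter handle returned by llama_lora_adapter_init(). For orientation, a minimal caller-side sketch of the API as it stands at this point in the series (not part of the patch; it simply mirrors the common.cpp hunk above, assumes lctx/model were created the usual way, and uses a placeholder adapter path):

    // sketch only: load a LoRA adapter through the context (patch 01 signature)
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(lctx, "my-adapter.gguf");
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter\n"); // needs <cstdio>
        llama_free(lctx);
        llama_free_model(model);
    }

Later patches in the series change this signature (the adapter becomes owned by the model and is attached to a context with a separate call), as shown further below.
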
diff --git a/src/llama.cpp b/src/llama.cpp index b770ca5bc33fc..ec89b2778ea08 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,6 +2547,29 @@ struct llama_control_vector { } }; +struct lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + lora_weight() {} + lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + // map tensor name to lora_a_b + std::map ab_map; + std::vector ctxs; + std::vector bufs; + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2704,6 +2727,10 @@ struct llama_context { } ggml_backend_buffer_free(buf_output); + + for (auto adapter : lora_adapters) { + delete adapter; + } } llama_cparams cparams; @@ -2795,6 +2822,9 @@ struct llama_context { // control vectors struct llama_control_vector cvec; + + // lora adapters + std::vector lora_adapters; }; static size_t llama_get_device_count(const llama_model & model) { @@ -18243,281 +18273,149 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_apply_lora_from_file_internal( - const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads -) { +static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { + static const int n_inp_tensors = 5; // see llama_model + static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - const int64_t t_start_lora_us = ggml_time_us(); - - llama_file fin(path_lora, "rb"); - - // verify magic and version - { - uint32_t magic = fin.read_u32(); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); - return 1; - } - - uint32_t format_version = fin.read_u32(); - if (format_version != 1) { - LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); - return 1; - } - } - - int32_t lora_r = fin.read_u32(); - int32_t lora_alpha = fin.read_u32(); - float scaling = scale * (float)lora_alpha / (float)lora_r; + // TODO: check lora base model arch - LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - - // load base model - std::unique_ptr ml; - if (path_base_model) { - LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); - ml->init_mappings(/*prefetch*/ false); // no prefetching - } - - struct tensor_meta { - std::string name; - ggml_type type; - int32_t ne[2]; - size_t offset; + ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, }; - std::map tensor_meta_map; - - // load all tensor meta - while (true) { - if (fin.tell() == fin.size) { - // eof - break; - } - - int32_t n_dims; - int32_t name_len; - int32_t ftype; - - fin.read_raw(&n_dims, sizeof(n_dims)); - fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); - - if (n_dims != 1 && n_dims != 2) { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) 
{ - fin.read_raw(&ne[i], sizeof(ne[i])); - } - - std::string name; - { - GGML_ASSERT(name_len < GGML_MAX_NAME); - char buf[GGML_MAX_NAME]; - fin.read_raw(buf, name_len); - name = std::string(buf, name_len); - } - - // check for lora suffix - std::string lora_suffix; - if (name.length() > 6) { - lora_suffix = name.substr(name.length() - 6); - } - if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { - LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; - } + struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); + if (!ctx_gguf) { + LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); + return -1; + } - // tensor type - ggml_type wtype; - switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; - default: - { - LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", - __func__, ftype); - return 1; - } + // calculate n_tensors_per_layer + int n_tensors_per_layer = 0; + { + int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + int il = -1; + sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); + if (il == 0) n_tensors_per_layer++; } - - // data offset - size_t offset = fin.tell(); - offset = (offset + 31) & -32; - - // skip tensor data - fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - - tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); } + printf("n_tensors_per_layer %d\n", n_tensors_per_layer); - bool warned = false; - int n_tensors = 0; - - // apply - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); - return 1; + // count layer buffer types + std::map buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; } - ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); - - std::vector> read_buf; - for (const auto & it : model.tensors_by_name) { - const std::string & base_name = it.first; - ggml_tensor * model_t = it.second; - - if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || - tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { - continue; - } - - tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); - tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - ggml_init_params lora_init_params = { - /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), - /* .mem_buffer */ nullptr, - /* .no_alloc */ true, + // allocate contexts + std::map ctx_map; + { + auto new_ggml_ctx = [](size_t n_tensors) { + struct ggml_init_params params = { + /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + return ggml_init(params); }; - ggml_context * lora_ctx = ggml_init(lora_init_params); - if (lora_ctx == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); - ggml_backend_free(backend_cpu); - return 1; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + printf("buf %p layers %d\n", it.first, it.second); + ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer); } + //ctx_map[model.buft_input.buft] = new_ggml_ctx(2*n_inp_tensors); + //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors); + } - // create tensors - ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], 
metaA.ne[1]); - ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); - ggml_set_name(loraA, metaA.name.c_str()); - ggml_set_name(loraB, metaB.name.c_str()); - - ggml_tensor * base_t; - if (ml) { - if (!ml->get_tensor_meta(base_name.c_str())) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; + // bundle lora_a and lora_b into pairs + std::map ab_map; + auto str_endswith = [](const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; + }; + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name(cur->name); + if (str_endswith(name, ".lora_a")) { + replace_all(name, ".lora_a", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = lora_weight(cur, nullptr); + } else { + ab_map[name].a = cur; + } + } else if (str_endswith(name, ".lora_b")) { + replace_all(name, ".lora_b", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = lora_weight(nullptr, cur); + } else { + ab_map[name].b = cur; } - base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); - } else { - base_t = ggml_dup_tensor(lora_ctx, model_t); - } - ggml_set_name(base_t, base_name.c_str()); - - // allocate in backend buffer - ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (lora_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); - return 1; } + } - // load tensor data - auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { - read_buf.resize(ggml_nbytes(tensor)); - fin.seek(tensor_meta.offset, SEEK_SET); - fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); - }; - load_tensor(metaA, loraA); - load_tensor(metaB, loraB); - - // load base model tensor data - if (ml) { - ml->load_data_for(base_t); + // add tensors + for (auto & it : ab_map) { + std::string name = it.first; + lora_weight & w = it.second; + GGML_ASSERT(w.a != nullptr); + GGML_ASSERT(w.b != nullptr); + int il = -1; + sscanf(name.c_str(), "blk.%d.", &il); + if (il >= 0) { + printf("%s %p %p\n", name.c_str(), w.a, w.b); + struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft); + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); } else { - ggml_backend_tensor_copy(model_t, base_t); - } - - if (ggml_is_quantized(base_t->type) && !warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; + // TODO: process output & token_embd tensors } + } - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; - } - - auto build_lora_graph = [&]() { - // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, 
loraA, loraB); - ggml_set_name(BA, "BA"); - - if (scaling != 1.0f) { - BA = ggml_scale(lora_ctx, BA, scaling); - ggml_set_name(BA, "BA_scaled"); - } - - ggml_tensor * r; - r = ggml_add_inplace(lora_ctx, base_t, BA); - ggml_set_name(r, "r_add"); - - if (base_t->type != model_t->type) { - // convert the result to the model type - r = ggml_cast(lora_ctx, r, model_t->type); - ggml_set_name(r, "r_cast"); + // allocate tensors / buffers and zero + { + adapter.ctxs.reserve(ctx_map.size()); + adapter.bufs.reserve(ctx_map.size()); + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); + return -1; } - - return r; - }; - - ggml_cgraph * gf = ggml_new_graph(lora_ctx); - ggml_tensor * r = build_lora_graph(); - ggml_build_forward_expand(gf, r); - - ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (graph_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; + ggml_backend_buffer_clear(buf, 0); + adapter.ctxs.push_back(ctx); + adapter.bufs.push_back(buf); } + } - ggml_backend_graph_compute(backend_cpu, gf); - - ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); - -#if 0 - // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU - //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); - - // sched compute - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_init_measure(sched, gf); - - // create the graph again, since the previous one was destroyed by the measure - ggml_graph_clear(gf); - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_graph_compute(sched, gf); - ggml_backend_sched_free(sched); -#endif - - ggml_backend_buffer_free(lora_buf); - ggml_backend_buffer_free(graph_buf); - ggml_free(lora_ctx); - - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); + // set tensor data + { + llama_file gguf_file(path_lora, "rb"); + std::vector read_buf; + auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { + size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); + size_t size = ggml_nbytes(orig); + if (read_buf.size() < size) { + read_buf.resize(size); + } + gguf_file.read_raw(read_buf.data(), size); + printf("%s: %s size=%ld\n", __func__, orig->name, size); + return ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + }; + for (auto & it : adapter.ab_map) { + auto orig = ab_map[it.first]; + auto dev = it.second; + set_tensor(orig.a, dev.a); + set_tensor(orig.b, dev.b); } } - ggml_backend_free(backend_cpu); - - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; - LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); - + // free ctx for reading gguf + ggml_free(ctx); return 0; } @@ -19298,12 +19196,19 @@ uint32_t llama_model_quantize( } } -int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora) { try { - return 
llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); + struct llama_lora_adapter * adapter = new llama_lora_adapter; + int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); + if (res == 0) { + ctx->lora_adapters.push_back(adapter); + return adapter; + } else { + return nullptr; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return 1; + return nullptr; } } From e9d7b6c05f928665cb9779629816128b8016418d Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 12:07:29 +0200 Subject: [PATCH 02/33] add patch tensor function --- src/llama.cpp | 211 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 193 insertions(+), 18 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ec89b2778ea08..d97eb3bb2fc63 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2702,6 +2702,10 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; + // used by lora, to save model's original tensors + std::vector orig_tensors; + std::vector orig_layers; + ~llama_model() { for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); @@ -13491,6 +13495,10 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { return result; } +// forward declaration +static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build); +static int32_t llama_lora_restore_tensors(struct llama_context & lctx); + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -13534,6 +13542,11 @@ static struct ggml_cgraph * llama_build_graph( llm.init(); + if (!lctx.lora_adapters.empty()) { + llama_lora_restore_tensors(lctx); + llama_lora_patch_tensors(lctx, llm.ctx0); + } + switch (model.arch) { case LLM_ARCH_LLAMA: { @@ -18304,10 +18317,12 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types - std::map buft_layer_count; + std::map buft_tensor_count; for (int64_t i = 0; i < model.hparams.n_layer; i++) { - buft_layer_count[model.buft_layer[i].buft]++; + buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer; } + buft_tensor_count[model.buft_input.buft] += n_inp_tensors; + buft_tensor_count[model.buft_output.buft] += n_out_tensors; // allocate contexts std::map ctx_map; @@ -18320,13 +18335,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co }; return ggml_init(params); }; - for (auto & it : buft_layer_count) { - int n_layers = it.second; - printf("buf %p layers %d\n", it.first, it.second); - ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer); + for (auto & it : buft_tensor_count) { + int n_tensors = it.second; + // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second); + ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors } - //ctx_map[model.buft_input.buft] = new_ggml_ctx(2*n_inp_tensors); - //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors); } // bundle lora_a and lora_b into pairs @@ -18356,22 +18369,29 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co // add tensors for (auto & it : ab_map) { std::string name = it.first; + const char * cname = name.c_str(); lora_weight & w = it.second; GGML_ASSERT(w.a != nullptr); GGML_ASSERT(w.b != nullptr); int il = -1; - sscanf(name.c_str(), "blk.%d.", &il); + sscanf(cname, "blk.%d.", &il); + struct 
ggml_context * dev_ctx; // device ctx if (il >= 0) { - printf("%s %p %p\n", name.c_str(), w.a, w.b); - struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft); - struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); - struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); - ggml_set_name(tensor_a, w.a->name); - ggml_set_name(tensor_b, w.b->name); - adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); + dev_ctx = ctx_map.at(model.buft_layer[il].buft); + } else if (strstr(cname, "tok") == 0) { + dev_ctx = ctx_map.at(model.buft_input.buft); + } else if (strstr(cname, "output") == 0) { + dev_ctx = ctx_map.at(model.buft_output.buft); } else { - // TODO: process output & token_embd tensors + LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); + continue; } + // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); } // allocate tensors / buffers and zero @@ -18402,8 +18422,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co if (read_buf.size() < size) { read_buf.resize(size); } + gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - printf("%s: %s size=%ld\n", __func__, orig->name, size); + // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size); return ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { @@ -18414,11 +18435,165 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } } + LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2); + // free ctx for reading gguf ggml_free(ctx); return 0; } +static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { + // TODO @ngxson : not ideal, but "const" is discarded to make it work + struct llama_model & model = const_cast(lctx.model); + if (!model.orig_tensors.empty()) { + size_t i = 0; + model.tok_embd = model.orig_tensors[i++]; + model.type_embd = model.orig_tensors[i++]; + model.pos_embd = model.orig_tensors[i++]; + model.tok_norm = model.orig_tensors[i++]; + model.tok_norm_b = model.orig_tensors[i++]; + model.output_norm = model.orig_tensors[i++]; + model.output_norm_b = model.orig_tensors[i++]; + model.output = model.orig_tensors[i++]; + model.output_b = model.orig_tensors[i++]; + model.output_norm_enc = model.orig_tensors[i++]; + for (size_t il = 0; il < model.orig_layers.size(); il++) { + model.layers[il] = model.orig_layers[il]; // copy + } + } +} + +static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) { + GGML_ASSERT(!lctx.lora_adapters.empty()); + // TODO @ngxson : not ideal, but "const" is discarded to make it work + struct llama_model & model = const_cast(lctx.model); + + // save all original tensors + if (model.orig_tensors.empty()) { + model.orig_tensors.push_back(model.tok_embd); + model.orig_tensors.push_back(model.type_embd); + model.orig_tensors.push_back(model.pos_embd); + model.orig_tensors.push_back(model.tok_norm); + model.orig_tensors.push_back(model.tok_norm_b); + model.orig_tensors.push_back(model.output_norm); + model.orig_tensors.push_back(model.output_norm_b); + model.orig_tensors.push_back(model.output); + model.orig_tensors.push_back(model.output_b); + 
model.orig_tensors.push_back(model.output_norm_enc); + model.orig_layers.reserve(model.layers.size()); + for (llama_layer layer : model.layers) { + model.orig_layers.push_back(layer); // copy + } + } + + // patch tensors + auto patch_tensor = [&](struct llama_lora_adapter * adapter, struct ggml_tensor ** tensor) { + if (*tensor == nullptr) { + return; + } + std::string name = ggml_get_name(*tensor); + if (adapter->ab_map.find(name) != adapter->ab_map.end()) { + auto lora_w = adapter->ab_map[name]; + struct ggml_tensor * cur = ggml_mul_mat(ctx_build, lora_w.a, lora_w.b); + cur = ggml_add(ctx_build, cur, *tensor); + // TODO: scale + ggml_format_name(cur, "%s.merged", name.c_str()); + // LLAMA_LOG_INFO("LORA %s\n", cur->name); + tensor = &cur; + } + }; + for (auto adapter : lctx.lora_adapters) { + patch_tensor(adapter, &model.tok_embd); + patch_tensor(adapter, &model.type_embd); + patch_tensor(adapter, &model.pos_embd); + patch_tensor(adapter, &model.tok_norm); + patch_tensor(adapter, &model.tok_norm_b); + patch_tensor(adapter, &model.output_norm); + patch_tensor(adapter, &model.output_norm_b); + patch_tensor(adapter, &model.output); + patch_tensor(adapter, &model.output_b); + patch_tensor(adapter, &model.output_norm_enc); + for (llama_layer & layer : model.layers) { + patch_tensor(adapter, &layer.attn_norm); + patch_tensor(adapter, &layer.attn_norm_b); + patch_tensor(adapter, &layer.attn_norm_2); + patch_tensor(adapter, &layer.attn_norm_2_b); + patch_tensor(adapter, &layer.attn_q_norm); + patch_tensor(adapter, &layer.attn_q_norm_b); + patch_tensor(adapter, &layer.attn_k_norm); + patch_tensor(adapter, &layer.attn_k_norm_b); + patch_tensor(adapter, &layer.attn_out_norm); + patch_tensor(adapter, &layer.attn_out_norm_b); + patch_tensor(adapter, &layer.attn_q_a_norm); + patch_tensor(adapter, &layer.attn_kv_a_norm); + patch_tensor(adapter, &layer.attn_sub_norm); + patch_tensor(adapter, &layer.attn_post_norm); + patch_tensor(adapter, &layer.ffn_sub_norm); + patch_tensor(adapter, &layer.attn_norm_cross); + patch_tensor(adapter, &layer.attn_norm_enc); + + patch_tensor(adapter, &layer.wq); + patch_tensor(adapter, &layer.wk); + patch_tensor(adapter, &layer.wv); + patch_tensor(adapter, &layer.wo); + patch_tensor(adapter, &layer.wqkv); + patch_tensor(adapter, &layer.wq_a); + patch_tensor(adapter, &layer.wq_b); + patch_tensor(adapter, &layer.wkv_a_mqa); + patch_tensor(adapter, &layer.wkv_b); + patch_tensor(adapter, &layer.wq_cross); + patch_tensor(adapter, &layer.wk_cross); + patch_tensor(adapter, &layer.wv_cross); + patch_tensor(adapter, &layer.wo_cross); + patch_tensor(adapter, &layer.wq_enc); + patch_tensor(adapter, &layer.wk_enc); + patch_tensor(adapter, &layer.wv_enc); + patch_tensor(adapter, &layer.wo_enc); + + patch_tensor(adapter, &layer.bq); + patch_tensor(adapter, &layer.bk); + patch_tensor(adapter, &layer.bv); + patch_tensor(adapter, &layer.bo); + patch_tensor(adapter, &layer.bqkv); + + patch_tensor(adapter, &layer.attn_rel_b); + patch_tensor(adapter, &layer.attn_rel_b_enc); + patch_tensor(adapter, &layer.attn_rel_b_cross); + + patch_tensor(adapter, &layer.ffn_norm); + patch_tensor(adapter, &layer.ffn_norm_b); + patch_tensor(adapter, &layer.ffn_post_norm); + patch_tensor(adapter, &layer.layer_out_norm); + patch_tensor(adapter, &layer.layer_out_norm_b); + patch_tensor(adapter, &layer.ffn_norm_exps); + patch_tensor(adapter, &layer.ffn_norm_enc); + + patch_tensor(adapter, &layer.ffn_gate); + patch_tensor(adapter, &layer.ffn_down); + patch_tensor(adapter, &layer.ffn_up); + patch_tensor(adapter, 
&layer.ffn_gate_enc); + patch_tensor(adapter, &layer.ffn_down_enc); + patch_tensor(adapter, &layer.ffn_up_enc); + + patch_tensor(adapter, &layer.ffn_gate_inp); + patch_tensor(adapter, &layer.ffn_gate_exps); + patch_tensor(adapter, &layer.ffn_down_exps); + patch_tensor(adapter, &layer.ffn_up_exps ); + + patch_tensor(adapter, &layer.ffn_gate_inp_shexp); + patch_tensor(adapter, &layer.ffn_gate_shexp); + patch_tensor(adapter, &layer.ffn_down_shexp); + patch_tensor(adapter, &layer.ffn_up_shexp); + + patch_tensor(adapter, &layer.ffn_gate_b); + patch_tensor(adapter, &layer.ffn_down_b); + patch_tensor(adapter, &layer.ffn_up_b); + patch_tensor(adapter, &layer.ffn_act); + } + } + return 0; +} + // // interface implementation // From 4e28ad40a099c7f618abf8ae113c4e56ee7705e8 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 13:29:37 +0200 Subject: [PATCH 03/33] correct tensor patch --- ggml/src/ggml.c | 4 ++-- src/llama.cpp | 33 ++++++++++++--------------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bc91ac3a726ab..2093be2a98013 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; @@ -19401,7 +19401,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); - if (ggml_nelements(node) < 5) { + if (ggml_nelements(node) < 5 && node->data != NULL) { fprintf(fp, " | ("); for (int j = 0; j < ggml_nelements(node); j++) { if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { diff --git a/src/llama.cpp b/src/llama.cpp index d97eb3bb2fc63..1c7f6650a9c47 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18314,7 +18314,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co if (il == 0) n_tensors_per_layer++; } } - printf("n_tensors_per_layer %d\n", n_tensors_per_layer); + // printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types std::map buft_tensor_count; @@ -18363,6 +18363,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } else { ab_map[name].b = cur; } + } else { + LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); } } @@ -18400,14 +18402,14 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co adapter.bufs.reserve(ctx_map.size()); for (auto it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + ggml_context * ctx_dev = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); if (!buf) { LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); return -1; } ggml_backend_buffer_clear(buf, 0); - adapter.ctxs.push_back(ctx); + adapter.ctxs.push_back(ctx_dev); adapter.bufs.push_back(buf); } } @@ -18424,8 +18426,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size); - return 
ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size); + ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { auto orig = ab_map[it.first]; @@ -18461,6 +18463,7 @@ static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { model.layers[il] = model.orig_layers[il]; // copy } } + return 0; } static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) { @@ -18498,8 +18501,8 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml cur = ggml_add(ctx_build, cur, *tensor); // TODO: scale ggml_format_name(cur, "%s.merged", name.c_str()); - // LLAMA_LOG_INFO("LORA %s\n", cur->name); - tensor = &cur; + // LLAMA_LOG_INFO("LORA %p %s\n", cur, cur->name); + *tensor = cur; } }; for (auto adapter : lctx.lora_adapters) { @@ -18541,14 +18544,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.wq_b); patch_tensor(adapter, &layer.wkv_a_mqa); patch_tensor(adapter, &layer.wkv_b); - patch_tensor(adapter, &layer.wq_cross); - patch_tensor(adapter, &layer.wk_cross); - patch_tensor(adapter, &layer.wv_cross); - patch_tensor(adapter, &layer.wo_cross); - patch_tensor(adapter, &layer.wq_enc); - patch_tensor(adapter, &layer.wk_enc); - patch_tensor(adapter, &layer.wv_enc); - patch_tensor(adapter, &layer.wo_enc); patch_tensor(adapter, &layer.bq); patch_tensor(adapter, &layer.bk); @@ -18556,10 +18551,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.bo); patch_tensor(adapter, &layer.bqkv); - patch_tensor(adapter, &layer.attn_rel_b); - patch_tensor(adapter, &layer.attn_rel_b_enc); - patch_tensor(adapter, &layer.attn_rel_b_cross); - patch_tensor(adapter, &layer.ffn_norm); patch_tensor(adapter, &layer.ffn_norm_b); patch_tensor(adapter, &layer.ffn_post_norm); @@ -18578,7 +18569,7 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.ffn_gate_inp); patch_tensor(adapter, &layer.ffn_gate_exps); patch_tensor(adapter, &layer.ffn_down_exps); - patch_tensor(adapter, &layer.ffn_up_exps ); + patch_tensor(adapter, &layer.ffn_up_exps); patch_tensor(adapter, &layer.ffn_gate_inp_shexp); patch_tensor(adapter, &layer.ffn_gate_shexp); From 1b4ffbac4720cd9bee0bc0422df927a1ff1dc22f Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 14:24:56 +0200 Subject: [PATCH 04/33] llama_lora_adapter_apply --- common/common.cpp | 3 +- ggml/src/ggml.c | 2 +- include/llama.h | 6 +- src/llama.cpp | 253 +++++++++++++++++++++------------------------- 4 files changed, 122 insertions(+), 142 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d3eec6aa783b3..d5dd4d38d3cf0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2063,13 +2063,14 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str()); + auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str(), lora_scale); if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } + llama_lora_adapter_apply(lctx, adapter); } if 
(params.ignore_eos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2093be2a98013..2e09b7087e667 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = TB;\n"); + fprintf(fp, " rankdir = LR;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; diff --git a/include/llama.h b/include/llama.h index 077d902837c49..50ea0d84773bf 100644 --- a/include/llama.h +++ b/include/llama.h @@ -515,7 +515,11 @@ extern "C" { // will be applied on top of the previous one LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_context * ctx, - const char * path_lora); + const char * path_lora, + float scale); + LLAMA_API int32_t llama_lora_adapter_apply( + struct llama_context * ctx, + struct llama_lora_adapter * adapter); // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. diff --git a/src/llama.cpp b/src/llama.cpp index 1c7f6650a9c47..de3d77485c0c2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2559,6 +2559,7 @@ struct llama_lora_adapter { std::map ab_map; std::vector ctxs; std::vector bufs; + float scale = 1.0f; ~llama_lora_adapter() { for (struct ggml_context * ctx : ctxs) { @@ -13495,10 +13496,6 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { return result; } -// forward declaration -static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build); -static int32_t llama_lora_restore_tensors(struct llama_context & lctx); - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -13542,11 +13539,6 @@ static struct ggml_cgraph * llama_build_graph( llm.init(); - if (!lctx.lora_adapters.empty()) { - llama_lora_restore_tensors(lctx); - llama_lora_patch_tensors(lctx, llm.ctx0); - } - switch (model.arch) { case LLM_ARCH_LLAMA: { @@ -18444,144 +18436,126 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co return 0; } -static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { - // TODO @ngxson : not ideal, but "const" is discarded to make it work - struct llama_model & model = const_cast(lctx.model); - if (!model.orig_tensors.empty()) { - size_t i = 0; - model.tok_embd = model.orig_tensors[i++]; - model.type_embd = model.orig_tensors[i++]; - model.pos_embd = model.orig_tensors[i++]; - model.tok_norm = model.orig_tensors[i++]; - model.tok_norm_b = model.orig_tensors[i++]; - model.output_norm = model.orig_tensors[i++]; - model.output_norm_b = model.orig_tensors[i++]; - model.output = model.orig_tensors[i++]; - model.output_b = model.orig_tensors[i++]; - model.output_norm_enc = model.orig_tensors[i++]; - for (size_t il = 0; il < model.orig_layers.size(); il++) { - model.layers[il] = model.orig_layers[il]; // copy - } - } - return 0; -} +int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_adapter * adapter) { + GGML_ASSERT(!lctx->lora_adapters.empty()); + const struct llama_model & model = lctx->model; + struct ggml_init_params ctx0_params = { + /*.mem_size =*/ lctx->buf_compute_meta.size(), + /*.mem_buffer =*/ lctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(ctx0_params); -static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) 
{ - GGML_ASSERT(!lctx.lora_adapters.empty()); - // TODO @ngxson : not ideal, but "const" is discarded to make it work - struct llama_model & model = const_cast(lctx.model); - - // save all original tensors - if (model.orig_tensors.empty()) { - model.orig_tensors.push_back(model.tok_embd); - model.orig_tensors.push_back(model.type_embd); - model.orig_tensors.push_back(model.pos_embd); - model.orig_tensors.push_back(model.tok_norm); - model.orig_tensors.push_back(model.tok_norm_b); - model.orig_tensors.push_back(model.output_norm); - model.orig_tensors.push_back(model.output_norm_b); - model.orig_tensors.push_back(model.output); - model.orig_tensors.push_back(model.output_b); - model.orig_tensors.push_back(model.output_norm_enc); - model.orig_layers.reserve(model.layers.size()); - for (llama_layer layer : model.layers) { - model.orig_layers.push_back(layer); // copy - } - } - - // patch tensors - auto patch_tensor = [&](struct llama_lora_adapter * adapter, struct ggml_tensor ** tensor) { - if (*tensor == nullptr) { + // apply lora for model tensors + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + std::vector> output_nodes; + auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) { + if (model_tensor == nullptr) { return; } - std::string name = ggml_get_name(*tensor); + std::string name = ggml_get_name(model_tensor); if (adapter->ab_map.find(name) != adapter->ab_map.end()) { auto lora_w = adapter->ab_map[name]; - struct ggml_tensor * cur = ggml_mul_mat(ctx_build, lora_w.a, lora_w.b); - cur = ggml_add(ctx_build, cur, *tensor); - // TODO: scale + struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); + cur = ggml_scale_inplace(ctx0, cur, adapter->scale); + cur = ggml_add(ctx0, cur, model_tensor); ggml_format_name(cur, "%s.merged", name.c_str()); - // LLAMA_LOG_INFO("LORA %p %s\n", cur, cur->name); - *tensor = cur; + ggml_build_forward_expand(gf, cur); + output_nodes.push_back({model_tensor, cur}); } }; - for (auto adapter : lctx.lora_adapters) { - patch_tensor(adapter, &model.tok_embd); - patch_tensor(adapter, &model.type_embd); - patch_tensor(adapter, &model.pos_embd); - patch_tensor(adapter, &model.tok_norm); - patch_tensor(adapter, &model.tok_norm_b); - patch_tensor(adapter, &model.output_norm); - patch_tensor(adapter, &model.output_norm_b); - patch_tensor(adapter, &model.output); - patch_tensor(adapter, &model.output_b); - patch_tensor(adapter, &model.output_norm_enc); - for (llama_layer & layer : model.layers) { - patch_tensor(adapter, &layer.attn_norm); - patch_tensor(adapter, &layer.attn_norm_b); - patch_tensor(adapter, &layer.attn_norm_2); - patch_tensor(adapter, &layer.attn_norm_2_b); - patch_tensor(adapter, &layer.attn_q_norm); - patch_tensor(adapter, &layer.attn_q_norm_b); - patch_tensor(adapter, &layer.attn_k_norm); - patch_tensor(adapter, &layer.attn_k_norm_b); - patch_tensor(adapter, &layer.attn_out_norm); - patch_tensor(adapter, &layer.attn_out_norm_b); - patch_tensor(adapter, &layer.attn_q_a_norm); - patch_tensor(adapter, &layer.attn_kv_a_norm); - patch_tensor(adapter, &layer.attn_sub_norm); - patch_tensor(adapter, &layer.attn_post_norm); - patch_tensor(adapter, &layer.ffn_sub_norm); - patch_tensor(adapter, &layer.attn_norm_cross); - patch_tensor(adapter, &layer.attn_norm_enc); - - patch_tensor(adapter, &layer.wq); - patch_tensor(adapter, &layer.wk); - patch_tensor(adapter, &layer.wv); - patch_tensor(adapter, &layer.wo); - patch_tensor(adapter, &layer.wqkv); - patch_tensor(adapter, &layer.wq_a); 
- patch_tensor(adapter, &layer.wq_b); - patch_tensor(adapter, &layer.wkv_a_mqa); - patch_tensor(adapter, &layer.wkv_b); - - patch_tensor(adapter, &layer.bq); - patch_tensor(adapter, &layer.bk); - patch_tensor(adapter, &layer.bv); - patch_tensor(adapter, &layer.bo); - patch_tensor(adapter, &layer.bqkv); - - patch_tensor(adapter, &layer.ffn_norm); - patch_tensor(adapter, &layer.ffn_norm_b); - patch_tensor(adapter, &layer.ffn_post_norm); - patch_tensor(adapter, &layer.layer_out_norm); - patch_tensor(adapter, &layer.layer_out_norm_b); - patch_tensor(adapter, &layer.ffn_norm_exps); - patch_tensor(adapter, &layer.ffn_norm_enc); - - patch_tensor(adapter, &layer.ffn_gate); - patch_tensor(adapter, &layer.ffn_down); - patch_tensor(adapter, &layer.ffn_up); - patch_tensor(adapter, &layer.ffn_gate_enc); - patch_tensor(adapter, &layer.ffn_down_enc); - patch_tensor(adapter, &layer.ffn_up_enc); - - patch_tensor(adapter, &layer.ffn_gate_inp); - patch_tensor(adapter, &layer.ffn_gate_exps); - patch_tensor(adapter, &layer.ffn_down_exps); - patch_tensor(adapter, &layer.ffn_up_exps); - - patch_tensor(adapter, &layer.ffn_gate_inp_shexp); - patch_tensor(adapter, &layer.ffn_gate_shexp); - patch_tensor(adapter, &layer.ffn_down_shexp); - patch_tensor(adapter, &layer.ffn_up_shexp); - - patch_tensor(adapter, &layer.ffn_gate_b); - patch_tensor(adapter, &layer.ffn_down_b); - patch_tensor(adapter, &layer.ffn_up_b); - patch_tensor(adapter, &layer.ffn_act); - } + apply_lora(adapter, model.tok_embd); + apply_lora(adapter, model.type_embd); + apply_lora(adapter, model.pos_embd); + apply_lora(adapter, model.tok_norm); + apply_lora(adapter, model.tok_norm_b); + apply_lora(adapter, model.output_norm); + apply_lora(adapter, model.output_norm_b); + apply_lora(adapter, model.output); + apply_lora(adapter, model.output_b); + apply_lora(adapter, model.output_norm_enc); + for (const llama_layer & layer : model.layers) { + apply_lora(adapter, layer.attn_norm); + apply_lora(adapter, layer.attn_norm_b); + apply_lora(adapter, layer.attn_norm_2); + apply_lora(adapter, layer.attn_norm_2_b); + apply_lora(adapter, layer.attn_q_norm); + apply_lora(adapter, layer.attn_q_norm_b); + apply_lora(adapter, layer.attn_k_norm); + apply_lora(adapter, layer.attn_k_norm_b); + apply_lora(adapter, layer.attn_out_norm); + apply_lora(adapter, layer.attn_out_norm_b); + apply_lora(adapter, layer.attn_q_a_norm); + apply_lora(adapter, layer.attn_kv_a_norm); + apply_lora(adapter, layer.attn_sub_norm); + apply_lora(adapter, layer.attn_post_norm); + apply_lora(adapter, layer.ffn_sub_norm); + apply_lora(adapter, layer.attn_norm_cross); + apply_lora(adapter, layer.attn_norm_enc); + + apply_lora(adapter, layer.wq); + apply_lora(adapter, layer.wk); + apply_lora(adapter, layer.wv); + apply_lora(adapter, layer.wo); + apply_lora(adapter, layer.wqkv); + apply_lora(adapter, layer.wq_a); + apply_lora(adapter, layer.wq_b); + apply_lora(adapter, layer.wkv_a_mqa); + apply_lora(adapter, layer.wkv_b); + + apply_lora(adapter, layer.bq); + apply_lora(adapter, layer.bk); + apply_lora(adapter, layer.bv); + apply_lora(adapter, layer.bo); + apply_lora(adapter, layer.bqkv); + + apply_lora(adapter, layer.ffn_norm); + apply_lora(adapter, layer.ffn_norm_b); + apply_lora(adapter, layer.ffn_post_norm); + apply_lora(adapter, layer.layer_out_norm); + apply_lora(adapter, layer.layer_out_norm_b); + apply_lora(adapter, layer.ffn_norm_exps); + apply_lora(adapter, layer.ffn_norm_enc); + + apply_lora(adapter, layer.ffn_gate); + apply_lora(adapter, layer.ffn_down); + apply_lora(adapter, 
layer.ffn_up); + apply_lora(adapter, layer.ffn_gate_enc); + apply_lora(adapter, layer.ffn_down_enc); + apply_lora(adapter, layer.ffn_up_enc); + + apply_lora(adapter, layer.ffn_gate_inp); + apply_lora(adapter, layer.ffn_gate_exps); + apply_lora(adapter, layer.ffn_down_exps); + apply_lora(adapter, layer.ffn_up_exps); + + apply_lora(adapter, layer.ffn_gate_inp_shexp); + apply_lora(adapter, layer.ffn_gate_shexp); + apply_lora(adapter, layer.ffn_down_shexp); + apply_lora(adapter, layer.ffn_up_shexp); + + apply_lora(adapter, layer.ffn_gate_b); + apply_lora(adapter, layer.ffn_down_b); + apply_lora(adapter, layer.ffn_up_b); + apply_lora(adapter, layer.ffn_act); + } + + // merge lora to model weight + ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); + if (res == GGML_STATUS_SUCCESS) { + for (auto & out : output_nodes) { + struct ggml_tensor * model_tensor = out.first; + struct ggml_tensor * merged_tensor = out.second; + ggml_backend_tensor_copy(merged_tensor, model_tensor); + ggml_set_name(model_tensor, merged_tensor->name); + } + LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, output_nodes.size()); + } else { + LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); + return res; } + + ggml_free(ctx0); return 0; } @@ -19362,9 +19336,10 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora, float scale) { try { struct llama_lora_adapter * adapter = new llama_lora_adapter; + adapter->scale = scale; int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); if (res == 0) { ctx->lora_adapters.push_back(adapter); From b88ce0f8927427929e25f45a419623a55ca043f4 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 15:06:32 +0200 Subject: [PATCH 05/33] correct ggml_backend_tensor_copy --- src/llama.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index de3d77485c0c2..5f02106d366a2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18446,9 +18446,10 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ }; struct ggml_context * ctx0 = ggml_init(ctx0_params); + // map "merged.%s" name to model tensor + std::map output_map; // apply lora for model tensors struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - std::vector> output_nodes; auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) { if (model_tensor == nullptr) { return; @@ -18459,9 +18460,9 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); cur = ggml_scale_inplace(ctx0, cur, adapter->scale); cur = ggml_add(ctx0, cur, model_tensor); - ggml_format_name(cur, "%s.merged", name.c_str()); + ggml_format_name(cur, "merged.%s", name.c_str()); ggml_build_forward_expand(gf, cur); - output_nodes.push_back({model_tensor, cur}); + output_map[std::string(cur->name)] = model_tensor; } }; apply_lora(adapter, model.tok_embd); @@ -18543,13 +18544,19 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ // merge lora to model weight ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); if (res == GGML_STATUS_SUCCESS) { - for (auto & out : output_nodes) 
{ - struct ggml_tensor * model_tensor = out.first; - struct ggml_tensor * merged_tensor = out.second; - ggml_backend_tensor_copy(merged_tensor, model_tensor); - ggml_set_name(model_tensor, merged_tensor->name); - } - LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, output_nodes.size()); + // graph maybe realloc, we need to find correct gf->nodes based on name + size_t n_merged = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + auto node = gf->nodes[i]; + std::string name(node->name); + if (output_map.find(name) != output_map.end()) { + struct ggml_tensor * model_tensor = output_map[name]; + ggml_backend_tensor_copy(node, model_tensor); + n_merged++; + } + } + GGML_ASSERT(n_merged == output_map.size()); + LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, n_merged); } else { LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); return res; From f6d090d7de2544be6a508d53630e791d9ce0751f Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 7 Jul 2024 16:01:05 +0200 Subject: [PATCH 06/33] add llm_build_mm --- common/common.cpp | 4 +- ggml/src/ggml.c | 2 +- include/llama.h | 24 ++- src/llama.cpp | 467 ++++++++++++++++++++-------------------------- 4 files changed, 220 insertions(+), 277 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d5dd4d38d3cf0..ec5709f83fd5e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2063,14 +2063,14 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str(), lora_scale); + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - llama_lora_adapter_apply(lctx, adapter); + llama_lora_adapter_set(lctx, adapter, lora_scale); } if (params.ignore_eos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2e09b7087e667..2093be2a98013 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; diff --git a/include/llama.h b/include/llama.h index 50ea0d84773bf..37140b7714788 100644 --- a/include/llama.h +++ b/include/llama.h @@ -508,19 +508,29 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a LoRA adapter to a loaded model - // path_base_model is the path to a higher quality model to use as a base for - // the layers modified by the adapter. Can be NULL to use the current loaded model. 
- // The model needs to be reloaded before applying a new adapter, otherwise the adapter - // will be applied on top of the previous one + // Load a LoRA adapter from file + // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( + struct llama_model * model, + const char * path_lora); + + // Add a loaded LoRA adapter to given context + // This will not modify model's weight + LLAMA_API int32_t llama_lora_adapter_set( struct llama_context * ctx, - const char * path_lora, + struct llama_lora_adapter * adapter, float scale); - LLAMA_API int32_t llama_lora_adapter_apply( + + // Remove a LoRA adapter from given context + // Return -1 if the adapter is not present in the context + LLAMA_API int32_t llama_lora_adapter_remove( struct llama_context * ctx, struct llama_lora_adapter * adapter); + // Manually free a LoRA adapter + // Note: loaded adapters will be free when the associated model is deleted + LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. // n_embd should be the size of a single layer's control, and data should point diff --git a/src/llama.cpp b/src/llama.cpp index 5f02106d366a2..ee18ca847fde5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,30 +2547,6 @@ struct llama_control_vector { } }; -struct lora_weight { - struct ggml_tensor * a = nullptr; - struct ggml_tensor * b = nullptr; - lora_weight() {} - lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} -}; - -struct llama_lora_adapter { - // map tensor name to lora_a_b - std::map ab_map; - std::vector ctxs; - std::vector bufs; - float scale = 1.0f; - - ~llama_lora_adapter() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } -}; - struct llama_vocab { using id = int32_t; using token = std::string; @@ -2703,9 +2679,8 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; - // used by lora, to save model's original tensors - std::vector orig_tensors; - std::vector orig_layers; + // keep track of loaded lora adapters + std::set lora_adapters; ~llama_model() { for (struct ggml_context * ctx : ctxs) { @@ -2719,6 +2694,9 @@ struct llama_model { #endif ggml_backend_buffer_free(buf); } + while (!lora_adapters.empty()) { + llama_lora_adapter_free(*lora_adapters.begin()); + } } }; @@ -2732,10 +2710,6 @@ struct llama_context { } ggml_backend_buffer_free(buf_output); - - for (auto adapter : lora_adapters) { - delete adapter; - } } llama_cparams cparams; @@ -2828,8 +2802,50 @@ struct llama_context { // control vectors struct llama_control_vector cvec; - // lora adapters - std::vector lora_adapters; + // lora adapters and scales + std::map lora_adapters; +}; + +struct lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + lora_weight() {} + lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + struct llama_model * base_model; + // map tensor name to lora_a_b + std::map ab_map; + std::vector ctxs; + std::vector bufs; + + llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { + base_model->lora_adapters.insert(this); + } + + bool has_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.find(name) != ab_map.end(); + } + + 
lora_weight & get_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.at(name); + } + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + auto pos = base_model->lora_adapters.find(this); + if (pos != base_model->lora_adapters.end()) { + base_model->lora_adapters.erase(pos); + } + } }; static size_t llama_get_device_count(const llama_model & model) { @@ -7773,6 +7789,32 @@ static void llm_build_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); } +// do mat_mul, while optionally apply lora +static struct ggml_tensor * llm_build_mm( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_adapter * adapter = it.first; + float scale = it.second; + if (!adapter->has_weight(w)) { + continue; + } + struct lora_weight & lora = adapter->get_weight(w); + // TODO: check if lora_a need transpose + struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a)); + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lora.b, + ggml_mul_mat(ctx0, a, cur) + ); + ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -7806,6 +7848,7 @@ static struct ggml_tensor * llm_build_norm( } static struct ggml_tensor * llm_build_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * up, @@ -7822,7 +7865,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? 
llm_build_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7839,12 +7882,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = ggml_mul_mat(ctx, gate, tmp); + cur = llm_build_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = ggml_mul_mat(ctx, gate, cur); + cur = llm_build_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -7899,7 +7942,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = ggml_mul_mat(ctx, down, cur); + cur = llm_build_mm(lctx, ctx, down, cur); } if (down_b) { @@ -7919,6 +7962,7 @@ static struct ggml_tensor * llm_build_ffn( } static struct ggml_tensor * llm_build_moe_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * gate_inp, @@ -7936,7 +7980,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8019,6 +8063,7 @@ static struct ggml_tensor * llm_build_moe_ffn( } static struct ggml_tensor * llm_build_kqv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8076,7 +8121,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q); cb(kq, "kq", il); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { @@ -8119,7 +8164,7 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq); cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); @@ -8132,7 +8177,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = ggml_mul_mat(ctx, wo, cur); + cur = llm_build_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8147,6 +8192,7 @@ static struct ggml_tensor * llm_build_kqv( } static struct ggml_tensor * llm_build_kv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8176,7 +8222,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, + cur = llm_build_kqv(lctx, ctx, model, hparams, cparams, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -8638,21 +8684,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); 
} - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8673,7 +8719,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8696,7 +8742,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -8710,7 +8756,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -8740,7 +8786,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -8808,7 +8854,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8830,7 +8876,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -8913,7 +8959,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8935,7 +8981,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9034,7 +9080,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9051,7 +9097,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + cur = llm_build_ffn(lctx, ctx0, attn_norm, // !! 
use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9158,7 +9204,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -9190,7 +9236,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9308,7 +9354,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9331,7 +9377,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9418,7 +9464,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9442,7 +9488,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9512,7 +9558,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9534,7 +9580,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9719,21 +9765,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, 
@@ -9807,7 +9853,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9831,7 +9877,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9939,13 +9985,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9969,7 +10015,7 @@ struct llm_build_context { model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10090,7 +10136,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10118,7 +10164,7 @@ struct llm_build_context { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10209,7 +10255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10231,7 +10277,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10323,7 +10369,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10344,7 +10390,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = 
llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10438,7 +10484,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10461,7 +10507,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -10481,7 +10527,7 @@ struct llm_build_context { ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, + ggml_tensor * cur_ffn = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -10595,7 +10641,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10610,7 +10656,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, attn_norm_output, + ffn_output = llm_build_ffn(lctx, ctx0, attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10715,7 +10761,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10830,7 +10876,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10848,7 +10894,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10932,7 +10978,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10956,7 +11002,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11043,7 +11089,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = 
llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11067,7 +11113,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11163,7 +11209,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11184,7 +11230,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11281,7 +11327,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11302,7 +11348,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11412,7 +11458,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11439,7 +11485,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11534,7 +11580,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11556,7 +11602,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11647,7 +11693,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11674,7 +11720,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = 
llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11784,7 +11830,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11806,7 +11852,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12077,7 +12123,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12094,7 +12140,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, ffn_inp, + cur = llm_build_ffn(lctx, ctx0, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12209,7 +12255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12231,7 +12277,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12336,7 +12382,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12358,7 +12404,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12445,7 +12491,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12470,7 +12516,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12501,7 +12547,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12588,7 +12634,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12610,7 +12656,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12627,7 +12673,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12810,7 +12856,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -12832,7 +12878,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12847,7 +12893,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12860,7 +12906,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -12965,7 +13011,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -12998,7 +13044,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -13132,7 +13178,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_enc, NULL, NULL, model.layers[il].ffn_gate_enc, NULL, NULL, model.layers[il].ffn_down_enc, NULL, NULL, @@ -13310,7 +13356,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, 
model.layers[il].ffn_down, NULL, NULL, @@ -13392,7 +13438,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -13416,7 +13462,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -18278,7 +18324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { +static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -18310,11 +18356,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co // count layer buffer types std::map buft_tensor_count; - for (int64_t i = 0; i < model.hparams.n_layer; i++) { - buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer; + for (int64_t i = 0; i < model->hparams.n_layer; i++) { + buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; } - buft_tensor_count[model.buft_input.buft] += n_inp_tensors; - buft_tensor_count[model.buft_output.buft] += n_out_tensors; + buft_tensor_count[model->buft_input.buft] += n_inp_tensors; + buft_tensor_count[model->buft_output.buft] += n_out_tensors; // allocate contexts std::map ctx_map; @@ -18371,11 +18417,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co sscanf(cname, "blk.%d.", &il); struct ggml_context * dev_ctx; // device ctx if (il >= 0) { - dev_ctx = ctx_map.at(model.buft_layer[il].buft); + dev_ctx = ctx_map.at(model->buft_layer[il].buft); } else if (strstr(cname, "tok") == 0) { - dev_ctx = ctx_map.at(model.buft_input.buft); + dev_ctx = ctx_map.at(model->buft_input.buft); } else if (strstr(cname, "output") == 0) { - dev_ctx = ctx_map.at(model.buft_output.buft); + dev_ctx = ctx_map.at(model->buft_output.buft); } else { LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); continue; @@ -18436,134 +18482,27 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co return 0; } -int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_adapter * adapter) { - GGML_ASSERT(!lctx->lora_adapters.empty()); - const struct llama_model & model = lctx->model; - struct ggml_init_params ctx0_params = { - /*.mem_size =*/ lctx->buf_compute_meta.size(), - /*.mem_buffer =*/ lctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - struct ggml_context * ctx0 = ggml_init(ctx0_params); - - // map "merged.%s" name to model tensor - std::map output_map; - // apply lora for model tensors - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) 
{ - if (model_tensor == nullptr) { - return; - } - std::string name = ggml_get_name(model_tensor); - if (adapter->ab_map.find(name) != adapter->ab_map.end()) { - auto lora_w = adapter->ab_map[name]; - struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); - cur = ggml_scale_inplace(ctx0, cur, adapter->scale); - cur = ggml_add(ctx0, cur, model_tensor); - ggml_format_name(cur, "merged.%s", name.c_str()); - ggml_build_forward_expand(gf, cur); - output_map[std::string(cur->name)] = model_tensor; - } - }; - apply_lora(adapter, model.tok_embd); - apply_lora(adapter, model.type_embd); - apply_lora(adapter, model.pos_embd); - apply_lora(adapter, model.tok_norm); - apply_lora(adapter, model.tok_norm_b); - apply_lora(adapter, model.output_norm); - apply_lora(adapter, model.output_norm_b); - apply_lora(adapter, model.output); - apply_lora(adapter, model.output_b); - apply_lora(adapter, model.output_norm_enc); - for (const llama_layer & layer : model.layers) { - apply_lora(adapter, layer.attn_norm); - apply_lora(adapter, layer.attn_norm_b); - apply_lora(adapter, layer.attn_norm_2); - apply_lora(adapter, layer.attn_norm_2_b); - apply_lora(adapter, layer.attn_q_norm); - apply_lora(adapter, layer.attn_q_norm_b); - apply_lora(adapter, layer.attn_k_norm); - apply_lora(adapter, layer.attn_k_norm_b); - apply_lora(adapter, layer.attn_out_norm); - apply_lora(adapter, layer.attn_out_norm_b); - apply_lora(adapter, layer.attn_q_a_norm); - apply_lora(adapter, layer.attn_kv_a_norm); - apply_lora(adapter, layer.attn_sub_norm); - apply_lora(adapter, layer.attn_post_norm); - apply_lora(adapter, layer.ffn_sub_norm); - apply_lora(adapter, layer.attn_norm_cross); - apply_lora(adapter, layer.attn_norm_enc); - - apply_lora(adapter, layer.wq); - apply_lora(adapter, layer.wk); - apply_lora(adapter, layer.wv); - apply_lora(adapter, layer.wo); - apply_lora(adapter, layer.wqkv); - apply_lora(adapter, layer.wq_a); - apply_lora(adapter, layer.wq_b); - apply_lora(adapter, layer.wkv_a_mqa); - apply_lora(adapter, layer.wkv_b); - - apply_lora(adapter, layer.bq); - apply_lora(adapter, layer.bk); - apply_lora(adapter, layer.bv); - apply_lora(adapter, layer.bo); - apply_lora(adapter, layer.bqkv); - - apply_lora(adapter, layer.ffn_norm); - apply_lora(adapter, layer.ffn_norm_b); - apply_lora(adapter, layer.ffn_post_norm); - apply_lora(adapter, layer.layer_out_norm); - apply_lora(adapter, layer.layer_out_norm_b); - apply_lora(adapter, layer.ffn_norm_exps); - apply_lora(adapter, layer.ffn_norm_enc); - - apply_lora(adapter, layer.ffn_gate); - apply_lora(adapter, layer.ffn_down); - apply_lora(adapter, layer.ffn_up); - apply_lora(adapter, layer.ffn_gate_enc); - apply_lora(adapter, layer.ffn_down_enc); - apply_lora(adapter, layer.ffn_up_enc); - - apply_lora(adapter, layer.ffn_gate_inp); - apply_lora(adapter, layer.ffn_gate_exps); - apply_lora(adapter, layer.ffn_down_exps); - apply_lora(adapter, layer.ffn_up_exps); - - apply_lora(adapter, layer.ffn_gate_inp_shexp); - apply_lora(adapter, layer.ffn_gate_shexp); - apply_lora(adapter, layer.ffn_down_shexp); - apply_lora(adapter, layer.ffn_up_shexp); - - apply_lora(adapter, layer.ffn_gate_b); - apply_lora(adapter, layer.ffn_down_b); - apply_lora(adapter, layer.ffn_up_b); - apply_lora(adapter, layer.ffn_act); - } - - // merge lora to model weight - ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); - if (res == GGML_STATUS_SUCCESS) { - // graph maybe realloc, we need to find correct gf->nodes based on name - size_t n_merged = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - 
auto node = gf->nodes[i]; - std::string name(node->name); - if (output_map.find(name) != output_map.end()) { - struct ggml_tensor * model_tensor = output_map[name]; - ggml_backend_tensor_copy(node, model_tensor); - n_merged++; - } - } - GGML_ASSERT(n_merged == output_map.size()); - LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, n_merged); - } else { - LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); - return res; +int32_t llama_lora_adapter_set( + struct llama_context * ctx, + struct llama_lora_adapter * adapter, + float scale) { + ctx->lora_adapters[adapter] = scale; + return 0; +} + +int32_t llama_lora_adapter_remove( + struct llama_context * ctx, + struct llama_lora_adapter * adapter) { + auto pos = ctx->lora_adapters.find(adapter); + if (pos != ctx->lora_adapters.end()) { + ctx->lora_adapters.erase(pos); + return 0; } + return -1; +} - ggml_free(ctx0); - return 0; +void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { + delete adapter; } // @@ -19343,17 +19282,11 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora, float scale) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { - struct llama_lora_adapter * adapter = new llama_lora_adapter; - adapter->scale = scale; - int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); - if (res == 0) { - ctx->lora_adapters.push_back(adapter); - return adapter; - } else { - return nullptr; - } + struct llama_lora_adapter * adapter = new llama_lora_adapter(model); + int res = llama_lora_adapter_init_internal(model, path_lora, *adapter); + return res == 0 ? 
adapter : nullptr; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return nullptr; From 30faf1f3def8ce627225f2401fb403d95907a47d Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 7 Jul 2024 16:36:50 +0200 Subject: [PATCH 07/33] fix auto merge --- src/llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7c79e4900dfca..ffc8ffbd23740 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -10863,7 +10863,7 @@ struct llm_build_context { // special-case: the up and gate tensors are merged into a single tensor // TOOD: support into llm_build_ffn { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -13622,7 +13622,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -13647,7 +13647,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, From 79e2982788b0102aabb098b1a3d6227a7e32a483 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 11:59:01 +0200 Subject: [PATCH 08/33] update based on review comments --- src/llama.cpp | 106 +++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ffc8ffbd23740..a4ceb0959caa2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2821,20 +2821,20 @@ struct llama_context { struct llama_control_vector cvec; // lora adapters and scales - std::map lora_adapters; + std::unordered_map lora_adapters; }; -struct lora_weight { +struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; - lora_weight() {} - lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} + llama_lora_weight() {} + llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} }; struct llama_lora_adapter { struct llama_model * base_model; // map tensor name to lora_a_b - std::map ab_map; + std::unordered_map ab_map; std::vector ctxs; std::vector bufs; @@ -2842,14 +2842,13 @@ struct llama_lora_adapter { base_model->lora_adapters.insert(this); } - bool has_weight(struct ggml_tensor * w) { + llama_lora_weight * get_weight(struct ggml_tensor * w) { std::string name(w->name); - return ab_map.find(name) != ab_map.end(); - } - - lora_weight & get_weight(struct ggml_tensor * w) { - std::string name(w->name); - return ab_map.at(name); + auto pos = ab_map.find(name); + if (ab_map.find(name) != ab_map.end()) { + return &pos->second; + } + return nullptr; } ~llama_lora_adapter() { @@ -7855,23 +7854,22 @@ static void llm_build_kv_store( } // do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_mm( +static struct ggml_tensor * llm_build_lora_mm( struct llama_context & lctx, struct ggml_context * ctx0, struct ggml_tensor * w, struct ggml_tensor * cur) { struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); for (auto & it : lctx.lora_adapters) { - struct llama_lora_adapter * adapter = it.first; + struct llama_lora_weight * lora 
= it.first->get_weight(w); float scale = it.second; - if (!adapter->has_weight(w)) { + if (lora == nullptr) { continue; } - struct lora_weight & lora = adapter->get_weight(w); // TODO: check if lora_a need transpose - struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a)); + struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a)); struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lora.b, + ctx0, lora->b, ggml_mul_mat(ctx0, a, cur) ); ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); @@ -7930,7 +7928,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? llm_build_mm(lctx, ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7947,12 +7945,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = llm_build_mm(lctx, ctx, gate, tmp); + cur = llm_build_lora_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = llm_build_mm(lctx, ctx, gate, cur); + cur = llm_build_lora_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -8020,7 +8018,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = llm_build_mm(lctx, ctx, down, cur); + cur = llm_build_lora_mm(lctx, ctx, down, cur); } if (down_b) { @@ -8058,7 +8056,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8199,7 +8197,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { @@ -8242,7 +8240,7 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); @@ -8255,7 +8253,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = llm_build_mm(lctx, ctx, wo, cur); + cur = llm_build_lora_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8762,21 +8760,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8864,7 +8862,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = llm_build_mm(lctx, ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -18517,7 +18515,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { +static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -18532,7 +18530,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); if (!ctx_gguf) { LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); - return -1; + throw std::exception(); } // calculate n_tensors_per_layer @@ -18574,7 +18572,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co } // bundle lora_a and lora_b into pairs - std::map ab_map; + std::map ab_map; auto str_endswith = [](const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; }; @@ -18583,18 +18581,19 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co if (str_endswith(name, ".lora_a")) { replace_all(name, ".lora_a", ""); if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = lora_weight(cur, nullptr); + ab_map[name] = llama_lora_weight(cur, nullptr); } else { ab_map[name].a = cur; } } else if (str_endswith(name, ".lora_b")) { replace_all(name, ".lora_b", ""); if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = lora_weight(nullptr, cur); + ab_map[name] = llama_lora_weight(nullptr, cur); } else { ab_map[name].b = cur; } } else { + // maybe "optimizer.*"" tensors LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); } } @@ -18603,28 +18602,26 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co for (auto & it : ab_map) { std::string name = it.first; const char * cname = name.c_str(); - lora_weight & w = it.second; + llama_lora_weight & w = it.second; GGML_ASSERT(w.a != nullptr); GGML_ASSERT(w.b != nullptr); int il = -1; sscanf(cname, "blk.%d.", &il); - struct ggml_context * dev_ctx; // device ctx - if (il >= 0) { - dev_ctx = ctx_map.at(model->buft_layer[il].buft); - } else if (strstr(cname, "tok") == 0) { - dev_ctx = ctx_map.at(model->buft_input.buft); - } else if (strstr(cname, "output") == 0) { - dev_ctx = ctx_map.at(model->buft_output.buft); - } else { - LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); - continue; + // device buft and device ctx + auto model_tensor = llama_get_model_tensor(model, cname); + if (!model_tensor) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model\n"); } + struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); + // TODO: validate tensor 
shape // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); ggml_set_name(tensor_a, w.a->name); ggml_set_name(tensor_b, w.b->name); - adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); + adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b); } // allocate tensors / buffers and zero @@ -18636,8 +18633,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co ggml_context * ctx_dev = it.second; ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); - return -1; + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("failed to allocate buffer for lora adapter\n"); } ggml_backend_buffer_clear(buf, 0); adapter.ctxs.push_back(ctx_dev); @@ -18671,14 +18669,18 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2); // free ctx for reading gguf + gguf_free(ctx_gguf); ggml_free(ctx); - return 0; } int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, float scale) { + if (ctx->cparams.flash_attn) { + LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__); + return -1; + } ctx->lora_adapters[adapter] = scale; return 0; } @@ -19479,8 +19481,8 @@ uint32_t llama_model_quantize( struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { struct llama_lora_adapter * adapter = new llama_lora_adapter(model); - int res = llama_lora_adapter_init_internal(model, path_lora, *adapter); - return res == 0 ? 
adapter : nullptr; + llama_lora_adapter_init_internal(model, path_lora, *adapter); + return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return nullptr; From 847135aaa25ae999060ddb8431f5d529f9244389 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 16:35:27 +0200 Subject: [PATCH 09/33] add convert script --- convert_lora_to_gguf.py | 149 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100755 convert_lora_to_gguf.py diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py new file mode 100755 index 0000000000000..9a5c7a2c8f916 --- /dev/null +++ b/convert_lora_to_gguf.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import logging +import argparse +import contextlib +import json +import os +import re +import sys +import types +from enum import IntEnum +from pathlib import Path +from hashlib import sha256 +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast + +import math +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +# reuse model definitions from convert_hf_to_gguf.py +from convert_hf_to_gguf import Model + +logger = logging.getLogger("lora-to-gguf") + +def parse_args() -> argparse.Namespace: + all_models = ", ".join([arch for arch in Model._model_classes.keys()]) + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file") + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input.", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--arch", type=str, + help=f"Arch of the base model, must be one of: {all_models} (default: LlamaForCausalLM)", + default="LlamaForCausalLM" + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--base", type=Path, required=True, + help="directory containing base model file", + ) + parser.add_argument( + "lora_path", type=Path, + help="directory containing LoRA adapter file", + ) + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + # FIXME: outtype is not working + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, + } + + dir_base_model = args.base + dir_lora = args.lora_path + input_json = os.path.join(dir_lora, "adapter_config.json") + input_model = os.path.join(dir_lora, "adapter_model.bin") + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_lora / 'ggml-lora.gguf' + + if os.path.exists(input_model): + 
lora_model = torch.load(input_model, map_location="cpu") + else: + input_model = os.path.join(dir_lora, "adapter_model.safetensors") + # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file + lora_model = load_file(input_model, device="cpu") + + # load base model + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) + with torch.inference_mode(): + try: + model_class = Model.from_model_architecture(hparams["architectures"][0]) + except NotImplementedError: + logger.error(f"Model {hparams['architectures'][0]} is not supported") + sys.exit(1) + + model_instance = model_class(dir_base_model, ftype_map[args.outtype], fname_out, args.bigendian, False, False, None) + logger.info("Set model parameters") + model_instance.set_gguf_parameters() + + # adapter_config = json.load(input_json) + model_instance.gguf_writer.add_string("training.type", "finetune_lora") + + map_tensors: dict[str, Tensor] = {} + for tensor_name, tensor in lora_model.items(): + orig_name = tensor_name.replace("base_model.model.", "") + orig_name = orig_name.replace(".lora_A.weight", ".weight") + orig_name = orig_name.replace(".lora_B.weight", ".weight") + is_lora_a = ".lora_A.weight" in tensor_name + is_lora_b = ".lora_B.weight" in tensor_name + if not is_lora_a and not is_lora_b: + logger.error(f"Unexpected name '{tensor_name}': Not a lora_A or lora_B tensor") + sys.exit(1) + dest_name = model_instance.map_tensor_name(orig_name) + dest_name = f"{dest_name}.lora_a" if is_lora_a else f"{dest_name}.lora_b" + # logger.info(f"{orig_name} --> {dest_name}") + map_tensors[dest_name] = tensor + + # overwrite method + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, tensor in map_tensors.items(): + yield (name, tensor) + + # overwrite method + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + return [(name, data_torch)] + + model_instance.get_tensors = types.MethodType(get_tensors, model_instance) + model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + logger.info("Exporting model...") + model_instance.write() + logger.info(f"Model successfully exported to {fname_out}") From 712fecba61b803fc324004220d7bb782240dcba6 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 16:48:55 +0200 Subject: [PATCH 10/33] no more transpose A --- src/llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a4ceb0959caa2..b42cc5fb4837d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7867,10 +7867,9 @@ static struct ggml_tensor * llm_build_lora_mm( continue; } // TODO: check if lora_a need transpose - struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a)); struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, - ggml_mul_mat(ctx0, a, cur) + ggml_mul_mat(ctx0, lora->a, cur) ); ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); res = ggml_add(ctx0, res, ab_cur); From 84288ff9f7e945bb730bb0df069ecf2054ba6076 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 17:05:17 +0200 Subject: [PATCH 11/33] add f16 convert --- convert_lora_to_gguf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 9a5c7a2c8f916..36ccb73cfc333 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -139,10 +139,17 @@ def 
get_tensors(self) -> Iterator[tuple[str, Tensor]]: # overwrite method def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused return [(name, data_torch)] + # overwrite method + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + return True + model_instance.get_tensors = types.MethodType(get_tensors, model_instance) model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) + model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() From 0e1618898599abe2890469a30305697f7c791a52 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 17:44:14 +0200 Subject: [PATCH 12/33] add metadata check --- src/llama.cpp | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index b42cc5fb4837d..ad11ef4943064 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -371,6 +371,8 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + + LLM_KV_TRAINING_TYPE, }; static const std::map LLM_KV_NAMES = { @@ -464,6 +466,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + + { LLM_KV_TRAINING_TYPE, "training.type" }, }; struct LLM_KV { @@ -18519,8 +18523,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - // TODO: check lora base model arch - ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { /* .no_alloc = */ false, @@ -18532,6 +18534,25 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c throw std::exception(); } + // check metadata + { + auto get_kv_str = [&](std::string key) -> std::string { + std::vector str_buf(32, 0); // we only get the arch, so no need big buffer here + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 
"" : std::string(gguf_get_val_str(ctx_gguf, id)); + }; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); + auto lora_arch = llm_arch_from_string(lora_arch_name); + if (lora_arch != model->arch) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); + if (train_type != "finetune_lora") { + throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); + } + } + // calculate n_tensors_per_layer int n_tensors_per_layer = 0; { @@ -18542,7 +18563,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c if (il == 0) n_tensors_per_layer++; } } - // printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types std::map buft_tensor_count; From 6c617e20efc2a8020b99ebdbe4721f17c2c34485 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 21:36:35 +0200 Subject: [PATCH 13/33] add sanity check --- src/llama.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ad11ef4943064..278c7912d4752 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -467,7 +467,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TRAINING_TYPE, "training.type" }, + { LLM_KV_TRAINING_TYPE, "training.type" }, }; struct LLM_KV { @@ -18521,7 +18521,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -18530,8 +18530,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c }; struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); if (!ctx_gguf) { - LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); - throw std::exception(); + throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); } // check metadata @@ -18631,11 +18630,17 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c if (!model_tensor) { gguf_free(ctx_gguf); ggml_free(ctx); - throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model\n"); + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); } struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); - // TODO: validate tensor shape - // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); + // validate tensor shape + if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + throw std::runtime_error("tensor '" + name + "' has incorrect shape"); + } + if (w.a->ne[1] != w.b->ne[0]) { + throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); + } + // save tensor to adapter struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); 
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); ggml_set_name(tensor_a, w.a->name); From 7a83f200d353db68fef8458017c7db17b0a303c4 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 21:55:41 +0200 Subject: [PATCH 14/33] fix ftype --- convert_lora_to_gguf.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 36ccb73cfc333..861ab1e97f536 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -5,19 +5,12 @@ import logging import argparse -import contextlib -import json import os -import re import sys import types -from enum import IntEnum from pathlib import Path -from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from typing import TYPE_CHECKING, Iterable, Iterator -import math -import numpy as np import torch if TYPE_CHECKING: @@ -32,22 +25,17 @@ logger = logging.getLogger("lora-to-gguf") + def parse_args() -> argparse.Namespace: - all_models = ", ".join([arch for arch in Model._model_classes.keys()]) parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") + description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", - ) - parser.add_argument( - "--arch", type=str, - help=f"Arch of the base model, must be one of: {all_models} (default: LlamaForCausalLM)", - default="LlamaForCausalLM" + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0", ) parser.add_argument( "--bigendian", action="store_true", @@ -73,14 +61,13 @@ def parse_args() -> argparse.Namespace: args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - # FIXME: outtype is not working ftype_map: dict[str, gguf.LlamaFileType] = { "f32": gguf.LlamaFileType.ALL_F32, "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "auto": gguf.LlamaFileType.GUESSED, } + ftype = ftype_map[args.outtype] dir_base_model = args.base dir_lora = args.lora_path @@ -110,7 +97,7 @@ def parse_args() -> argparse.Namespace: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_base_model, ftype_map[args.outtype], fname_out, args.bigendian, False, False, None) + model_instance = model_class(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) logger.info("Set model parameters") model_instance.set_gguf_parameters() @@ -140,16 +127,18 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: # overwrite method def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + # TODO: This will not take into account tensor transformations return [(name, data_torch)] # overwrite method def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: del name, new_name, bid, n_dims # unused 
- return True + return ftype != gguf.LlamaFileType.ALL_F32 model_instance.get_tensors = types.MethodType(get_tensors, model_instance) model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() From d52455f2bec45d7e6df8da5b26b91d969ce4580d Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 22:00:13 +0200 Subject: [PATCH 15/33] add requirements --- requirements/requirements-convert_lora_to_gguf.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements/requirements-convert_lora_to_gguf.txt diff --git a/requirements/requirements-convert_lora_to_gguf.txt b/requirements/requirements-convert_lora_to_gguf.txt new file mode 100644 index 0000000000000..5758076c41dc1 --- /dev/null +++ b/requirements/requirements-convert_lora_to_gguf.txt @@ -0,0 +1,2 @@ +-r ./requirements-convert_hf_to_gguf.txt +--extra-index-url https://download.pytorch.org/whl/cpu From 802565ca4327c3dbc02b83ad25ecd4b2bd8253b7 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 22:01:23 +0200 Subject: [PATCH 16/33] fix requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 52456c2e6fd24..9e190ae27de38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ -r ./requirements/requirements-convert_hf_to_gguf.txt -r ./requirements/requirements-convert_hf_to_gguf_update.txt -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt +-r ./requirements/requirements-convert_lora_to_gguf.txt From 95b3eb057b0261a48aeadcb1524a1f58d7ef39cc Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 22:05:35 +0200 Subject: [PATCH 17/33] fix outfile --- convert_lora_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 861ab1e97f536..76c673101a46f 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -31,7 +31,7 @@ def parse_args() -> argparse.Namespace: description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") parser.add_argument( "--outfile", type=Path, - help="path to write to; default: based on input.", + help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", ) parser.add_argument( "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", @@ -77,7 +77,7 @@ def parse_args() -> argparse.Namespace: fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_lora / 'ggml-lora.gguf' + fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' if os.path.exists(input_model): lora_model = torch.load(input_model, map_location="cpu") From ee2b35c65f7e4e862990d6460c4b0a0ac433a874 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 10 Jul 2024 00:23:07 +0200 Subject: [PATCH 18/33] conversion: only allow selected models --- convert_hf_to_gguf.py | 11 ++++++++-- convert_lora_to_gguf.py | 45 +++++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6ee41d3a118e5..109135b6821aa 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -373,6 +373,9 @@ def from_model_architecture(cls, arch: str) -> type[Model]: except KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + def support_lora(self) -> bool: + return False + # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] @@ -1416,9 +1419,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias")): + if name.endswith(("q_proj.weight", "q_proj.bias", "q_proj.lora_B.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): + if name.endswith(("k_proj.weight", "k_proj.bias", "k_proj.lora_B.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately @@ -1466,6 +1469,10 @@ def write_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") + def support_lora(self) -> bool: + # TODO: support lora conversion for MOE + return "num_local_experts" not in self.hparams + @Model.register("BitnetForCausalLM") class BitnetModel(Model): diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 76c673101a46f..c1ae1e9658788 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -9,7 +9,7 @@ import sys import types from pathlib import Path -from typing import TYPE_CHECKING, Iterable, Iterator +from typing import TYPE_CHECKING, Iterator import torch @@ -26,6 +26,13 @@ logger = logging.getLogger("lora-to-gguf") +def get_base_tensor_name(lora_tensor_name: str) -> str: + base_name = lora_tensor_name.replace("base_model.model.", "") + base_name = base_name.replace(".lora_A.weight", ".weight") + base_name = base_name.replace(".lora_B.weight", ".weight") + return base_name + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") @@ -103,43 +110,47 @@ def parse_args() -> argparse.Namespace: # adapter_config = json.load(input_json) model_instance.gguf_writer.add_string("training.type", "finetune_lora") + if not model_instance.support_lora(): + logger.error("LoRA conversion is not yet supported for this model") + sys.exit(1) - map_tensors: dict[str, Tensor] = {} + # map original name to gguf name + map_name: dict[str, str] = {} for tensor_name, tensor in lora_model.items(): - orig_name = 
tensor_name.replace("base_model.model.", "") - orig_name = orig_name.replace(".lora_A.weight", ".weight") - orig_name = orig_name.replace(".lora_B.weight", ".weight") + base_name = get_base_tensor_name(tensor_name) is_lora_a = ".lora_A.weight" in tensor_name is_lora_b = ".lora_B.weight" in tensor_name if not is_lora_a and not is_lora_b: logger.error(f"Unexpected name '{tensor_name}': Not a lora_A or lora_B tensor") sys.exit(1) - dest_name = model_instance.map_tensor_name(orig_name) + dest_name = model_instance.map_tensor_name(base_name) dest_name = f"{dest_name}.lora_a" if is_lora_a else f"{dest_name}.lora_b" - # logger.info(f"{orig_name} --> {dest_name}") - map_tensors[dest_name] = tensor + map_name[tensor_name] = dest_name # overwrite method - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, tensor in map_tensors.items(): - yield (name, tensor) + def map_tensor_name(self, name: str) -> Iterator[tuple[str, Tensor]]: + return map_name[name] # overwrite method - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # TODO: This will not take into account tensor transformations - return [(name, data_torch)] + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, tensor in lora_model.items(): + yield (name, tensor) # overwrite method def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: del name, new_name, bid, n_dims # unused return ftype != gguf.LlamaFileType.ALL_F32 + model_instance._map_tensor_name = model_instance.map_tensor_name + model_instance.map_tensor_name = types.MethodType(map_tensor_name, model_instance) + + model_instance._get_tensors = model_instance.get_tensors model_instance.get_tensors = types.MethodType(get_tensors, model_instance) - model_instance.modify_tensors = types.MethodType(modify_tensors, model_instance) + + model_instance._extra_f16_tensors = model_instance.extra_f16_tensors model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() - logger.info(f"Model successfully exported to {fname_out}") + logger.info(f"Model successfully exported to {model_instance.fname_out}") From 713665db2ef05770dc3eb72b277034b2325758b0 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 10 Jul 2024 00:36:52 +0200 Subject: [PATCH 19/33] fix types --- convert_lora_to_gguf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index c1ae1e9658788..c7393ac3aceb2 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -128,7 +128,7 @@ def parse_args() -> argparse.Namespace: map_name[tensor_name] = dest_name # overwrite method - def map_tensor_name(self, name: str) -> Iterator[tuple[str, Tensor]]: + def map_tensor_name(self, name: str) -> str: return map_name[name] # overwrite method @@ -141,13 +141,13 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i del name, new_name, bid, n_dims # unused return ftype != gguf.LlamaFileType.ALL_F32 - model_instance._map_tensor_name = model_instance.map_tensor_name + model_instance._map_tensor_name = model_instance.map_tensor_name # type: ignore model_instance.map_tensor_name = types.MethodType(map_tensor_name, model_instance) - model_instance._get_tensors = model_instance.get_tensors + model_instance._get_tensors = model_instance.get_tensors # 
type: ignore model_instance.get_tensors = types.MethodType(get_tensors, model_instance) - model_instance._extra_f16_tensors = model_instance.extra_f16_tensors + model_instance._extra_f16_tensors = model_instance.extra_f16_tensors # type: ignore model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) From f15167a4c7532101aa61e3e093a92801bf0d3ead Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 10 Jul 2024 02:21:38 +0200 Subject: [PATCH 20/33] cuda : do not use dmmv if the tensor does not have enough cols --- ggml/src/ggml-cuda.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 1c9ccc8a15e54..dfd75e0e7090b 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; + && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2 + && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; From 9841fbda7ceed5226283d7ad254b0d8f72305145 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 10 Jul 2024 02:21:53 +0200 Subject: [PATCH 21/33] llama : lora fixes --- src/llama.cpp | 83 ++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 278c7912d4752..fda48e822dc48 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2831,7 +2831,7 @@ struct llama_context { struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; - llama_lora_weight() {} + llama_lora_weight() = default; llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} }; @@ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - static const int n_inp_tensors = 5; // see llama_model - static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ false, + /* .no_alloc = */ true, /* .ctx = */ &ctx, }; struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); @@ -18536,7 +18534,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c // check metadata { auto get_kv_str = [&](std::string key) -> std::string { - std::vector str_buf(32, 0); // we only get the arch, so no need big buffer here int id = gguf_find_key(ctx_gguf, key.c_str()); return id < 0 ? 
"" : std::string(gguf_get_val_str(ctx_gguf, id)); }; @@ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto lora_arch = llm_arch_from_string(lora_arch_name); if (lora_arch != model->arch) { + gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } + auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); if (train_type != "finetune_lora") { + gguf_free(ctx_gguf); throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); } } - // calculate n_tensors_per_layer - int n_tensors_per_layer = 0; - { - int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - int il = -1; - sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); - if (il == 0) n_tensors_per_layer++; - } - } - - // count layer buffer types - std::map buft_tensor_count; - for (int64_t i = 0; i < model->hparams.n_layer; i++) { - buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; - } - buft_tensor_count[model->buft_input.buft] += n_inp_tensors; - buft_tensor_count[model->buft_output.buft] += n_out_tensors; + int n_tensors = gguf_get_n_tensors(ctx_gguf); - // allocate contexts + // contexts for each buffer type std::map ctx_map; - { - auto new_ggml_ctx = [](size_t n_tensors) { + auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + // add a new context struct ggml_init_params params = { /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - return ggml_init(params); + ggml_context * buft_ctx = ggml_init(params); + ctx_map[buft] = buft_ctx; + return buft_ctx; }; - for (auto & it : buft_tensor_count) { - int n_tensors = it.second; - // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second); - ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors - } - } + return it->second; + }; // bundle lora_a and lora_b into pairs std::map ab_map; @@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ab_map[name].b = cur; } } else { - // maybe "optimizer.*"" tensors - LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } } // add tensors for (auto & it : ab_map) { - std::string name = it.first; - const char * cname = name.c_str(); + const std::string & name = it.first; llama_lora_weight & w = it.second; - GGML_ASSERT(w.a != nullptr); - GGML_ASSERT(w.b != nullptr); - int il = -1; - sscanf(cname, "blk.%d.", &il); + + if (!w.a || !w.b) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); + } + // device buft and device ctx - auto model_tensor = llama_get_model_tensor(model, cname); + auto * model_tensor = llama_get_model_tensor(model, name.c_str()); if (!model_tensor) { gguf_free(ctx_gguf); ggml_free(ctx); throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); } - struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); + struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); // validate tensor shape if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + 
gguf_free(ctx_gguf); + ggml_free(ctx); throw std::runtime_error("tensor '" + name + "' has incorrect shape"); } if (w.a->ne[1] != w.b->ne[0]) { + gguf_free(ctx_gguf); + ggml_free(ctx); throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); } // save tensor to adapter @@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ggml_free(ctx); throw std::runtime_error("failed to allocate buffer for lora adapter\n"); } - ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); adapter.ctxs.push_back(ctx_dev); adapter.bufs.push_back(buf); } @@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); size_t size = ggml_nbytes(orig); - if (read_buf.size() < size) { - read_buf.resize(size); - } + read_buf.resize(size); gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size); ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { From 1faf7e5be6b339d0ac2f3b6615200627f18aa8dc Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 10 Jul 2024 19:51:34 +0200 Subject: [PATCH 22/33] do not disable mmap with lora Co-authored-by: slaren --- common/common.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9d9980cf18169..ddb1e79ae6ae3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -681,7 +681,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa if (arg == "--lora") { CHECK_ARG params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; return true; } if (arg == "--lora-scaled") { @@ -689,7 +688,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa const char* lora_adapter = argv[i]; CHECK_ARG params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; return true; } if (arg == "--lora-base") { From 916e95928b0757fb9e5e601ee5f325af29e5253e Mon Sep 17 00:00:00 2001 From: ngxson Date: Thu, 11 Jul 2024 00:30:07 +0200 Subject: [PATCH 23/33] llm_build_lora_mm_id --- src/llama.cpp | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7ed80fcafcc27..30ecbb801069d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7882,7 +7882,6 @@ static struct ggml_tensor * llm_build_lora_mm( if (lora == nullptr) { continue; } - // TODO: check if lora_a need transpose struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) @@ -7893,6 +7892,31 @@ static struct ggml_tensor * llm_build_lora_mm( return res; } +// do mat_mul_id, while optionally apply lora +static struct ggml_tensor * llm_build_lora_mm_id( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_weight * lora = 
it.first->get_weight(w); + float scale = it.second; + if (lora == nullptr) { + continue; + } + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lora->b, + ggml_mul_mat_id(ctx0, lora->a, cur, ids), + ids + ); + ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -8103,10 +8127,10 @@ static struct ggml_tensor * llm_build_moe_ffn( } cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(gate, "ffn_moe_gate", il); switch (type_op) { @@ -8127,7 +8151,7 @@ static struct ggml_tensor * llm_build_moe_ffn( ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] cb(par, "ffn_moe_gate_par", il); - ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); experts = ggml_mul(ctx, experts, weights); From 9d96328bdf81b2f39ed356ee0b78afa87963be10 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 9 Jul 2024 18:26:38 -0400 Subject: [PATCH 24/33] convert_lora : MoE LoRA conversion support * convert_lora : prefer safetensors, similarly to convert_hf --- convert_hf_to_gguf.py | 11 +- convert_lora_to_gguf.py | 262 ++++++++++++++++++++++++++++++++-------- 2 files changed, 216 insertions(+), 57 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 61f8e370c30fd..ebb5ca376133b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -373,9 +373,6 @@ def from_model_architecture(cls, arch: str) -> type[Model]: except KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - def support_lora(self) -> bool: - return False - # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] @@ -1415,9 +1412,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias", "q_proj.lora_B.weight")): + if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias", "k_proj.lora_B.weight")): + if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately @@ -1465,10 +1462,6 @@ def write_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") - def support_lora(self) -> bool: - # TODO: support lora conversion for MOE - return "num_local_experts" not in self.hparams - @Model.register("BitnetForCausalLM") class BitnetModel(Model): diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 
c7393ac3aceb2..2d01fdc466f9c 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -3,13 +3,14 @@ from __future__ import annotations +from dataclasses import dataclass import logging import argparse import os import sys -import types from pathlib import Path -from typing import TYPE_CHECKING, Iterator +from types import EllipsisType +from typing import TYPE_CHECKING, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast import torch @@ -26,6 +27,169 @@ logger = logging.getLogger("lora-to-gguf") +@dataclass +class PartialLoraTensor: + A: Tensor | None = None + B: Tensor | None = None + + +# magic to support tensor shape modifications and splitting +class LoraTorchTensor: + _lora_A: Tensor + _lora_B: Tensor + _rank: int + + def __init__(self, A: Tensor, B: Tensor): + assert len(A.shape) == len(B.shape) + if A.dtype != B.dtype: + A = A.to(torch.float32) + B = B.to(torch.float32) + self._lora_A = A + self._lora_B = B + assert self._lora_A.shape[-2] == self._lora_B.shape[-1] + self._rank = self._lora_B.shape[-1] + + def __getitem__( + self, + indices: ( + SupportsIndex + | slice + | tuple[SupportsIndex | slice | EllipsisType | Tensor, ...] + ), + ) -> LoraTorchTensor: + shape = self.shape + if isinstance(indices, (SupportsIndex, slice)): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + raise NotImplementedError + elif isinstance(indices, tuple): + assert len(indices) > 0 + if isinstance(indices[-1], EllipsisType): + return self[indices[:-1]] + # expand ellipsis + indices = tuple( + u + for v in ( + ( + (slice(None, None) for _ in range(len(indices) - 1)) + if isinstance(i, EllipsisType) + else (i,) + ) + for i in indices + ) + for u in v + ) + + if len(indices) < len(shape): + indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape)))) + + # TODO: make sure this is correct + # lora_A has a shape which looks like (..., 1, 1, rank, self.shape[-1]) + indices_A = ( + *( + 0 if isinstance(i, SupportsIndex) else slice(None, None) + for i in indices[:-2] + ), + slice(None, None), + indices[-1], + ) + indices_B = indices[:-1] + return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) + else: + raise NotImplementedError + + @property + def dtype(self) -> torch.dtype: + assert self._lora_A.dtype == self._lora_B.dtype + return self._lora_A.dtype + + @property + def shape(self) -> tuple[int, ...]: + return (*self._lora_B.shape[:-1], self._lora_A.shape[-1]) + + def size(self, dim=None): + assert dim is None + return self.shape + + def reshape(self, *shape: int | tuple[int]) -> LoraTorchTensor: + if isinstance(shape[0], tuple): + new_shape: tuple[int] = shape[0] + else: + new_shape = cast(tuple[int], shape) + orig_shape = self.shape + if new_shape[-1] != orig_shape[-1]: + raise NotImplementedError + return LoraTorchTensor( + self._lora_A.reshape((*(1 for _ in new_shape[:-2]), *self._lora_A.shape[-2:])), + self._lora_B.reshape((*new_shape[:-1], self._rank)), + ) + + def reshape_as(self, other: Tensor) -> LoraTorchTensor: + return self.reshape(*other.shape) + + def view(self, *size: int) -> LoraTorchTensor: + return self.reshape(*size) + + def permute(self, *dims: int) -> LoraTorchTensor: + shape = self.shape + dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) + if dims[-1] == -2 and dims[-2] == -1: + return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) + else: + assert dims[-1] == -1 + assert all(dim == 1 for dim in self._lora_A.shape[:-2]) + return 
LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) + + def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: + shape = self.shape + dims = [i for i in range(len(shape))] + dims[dim0], dims[dim1] = dims[dim1], dims[dim0] + return self.permute(*dims) + + def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor: + return self.transpose(axis0, axis1) + + def to(self, *args, **kwargs): + return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs)) + + @classmethod + def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.permute: + return type(args[0]).permute(*args, **kwargs) + elif func is torch.reshape: + return type(args[0]).reshape(*args, **kwargs) + elif func is torch.stack: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + return LoraTorchTensor( + torch.stack([a._lora_A for a in args[0]], dim), + torch.stack([b._lora_B for b in args[0]], dim), + ) + elif func is torch.cat: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + if len(args[0][0].shape) > 2: + return LoraTorchTensor( + torch.cat([a._lora_A for a in args[0]], dim), + torch.cat([b._lora_B for b in args[0]], dim), + ) + else: + return LoraTorchTensor( + args[0][0]._lora_A, # TODO: is this correct? (can't cat over the rank) + torch.cat([b._lora_B for b in args[0]], dim), + ) + else: + raise NotImplementedError + + def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") @@ -79,7 +243,7 @@ def parse_args() -> argparse.Namespace: dir_base_model = args.base dir_lora = args.lora_path input_json = os.path.join(dir_lora, "adapter_config.json") - input_model = os.path.join(dir_lora, "adapter_model.bin") + input_model = os.path.join(dir_lora, "adapter_model.safetensors") if args.outfile is not None: fname_out = args.outfile else: @@ -87,12 +251,13 @@ def parse_args() -> argparse.Namespace: fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' if os.path.exists(input_model): - lora_model = torch.load(input_model, map_location="cpu") - else: - input_model = os.path.join(dir_lora, "adapter_model.safetensors") # lazy import load_file only if lora is in safetensors format. 
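LoraTorchTensor never materializes the full delta; it carries the two factors and re-expresses reshape/permute/cat on them. The transpose case above relies on the identity (B A)^T = A^T B^T, i.e. transposing the product is the same as swapping and transposing the factors. A quick PyTorch check of that identity (sizes are arbitrary):

    import torch

    A = torch.randn(8, 32)      # lora_A: (rank, n_in)
    B = torch.randn(64, 8)      # lora_B: (n_out, rank)

    delta = B @ A               # implicit full tensor, shape (n_out, n_in)
    # transposing the product equals the product of the swapped, transposed factors,
    # so a 2-D permute can be expressed on the factors without ever forming delta
    assert torch.allclose(delta.T, A.T @ B.T)

This is what lets the per-architecture modify_tensors code, written for plain tensors, run unchanged on LoRA pairs.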
from safetensors.torch import load_file + lora_model = load_file(input_model, device="cpu") + else: + input_model = os.path.join(dir_lora, "adapter_model.bin") + lora_model = torch.load(input_model, map_location="cpu", weights_only=True) # load base model logger.info(f"Loading base model: {dir_base_model.name}") @@ -104,53 +269,54 @@ def parse_args() -> argparse.Namespace: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) - logger.info("Set model parameters") - model_instance.set_gguf_parameters() + class LoraModel(model_class): + model_arch = model_class.model_arch - # adapter_config = json.load(input_json) - model_instance.gguf_writer.add_string("training.type", "finetune_lora") - if not model_instance.support_lora(): - logger.error("LoRA conversion is not yet supported for this model") - sys.exit(1) - - # map original name to gguf name - map_name: dict[str, str] = {} - for tensor_name, tensor in lora_model.items(): - base_name = get_base_tensor_name(tensor_name) - is_lora_a = ".lora_A.weight" in tensor_name - is_lora_b = ".lora_B.weight" in tensor_name - if not is_lora_a and not is_lora_b: - logger.error(f"Unexpected name '{tensor_name}': Not a lora_A or lora_B tensor") - sys.exit(1) - dest_name = model_instance.map_tensor_name(base_name) - dest_name = f"{dest_name}.lora_a" if is_lora_a else f"{dest_name}.lora_b" - map_name[tensor_name] = dest_name + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_map: dict[str, PartialLoraTensor] = {} - # overwrite method - def map_tensor_name(self, name: str) -> str: - return map_name[name] + for name, tensor in lora_model.items(): + base_name = get_base_tensor_name(name) + is_lora_a = ".lora_A.weight" in name + is_lora_b = ".lora_B.weight" in name + if not is_lora_a and not is_lora_b: + if ".base_layer.weight" in name: + continue + logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") + sys.exit(1) - # overwrite method - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, tensor in lora_model.items(): - yield (name, tensor) + if base_name in tensor_map: + if is_lora_a: + tensor_map[base_name].A = tensor + else: + tensor_map[base_name].B = tensor + else: + if is_lora_a: + tensor_map[base_name] = PartialLoraTensor(A=tensor) + else: + tensor_map[base_name] = PartialLoraTensor(B=tensor) - # overwrite method - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - return ftype != gguf.LlamaFileType.ALL_F32 + for name, tensor in tensor_map.items(): + assert tensor.A is not None + assert tensor.B is not None + yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) - model_instance._map_tensor_name = model_instance.map_tensor_name # type: ignore - model_instance.map_tensor_name = types.MethodType(map_tensor_name, model_instance) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + dest = super().modify_tensors(data_torch, name, bid) + for dest_name, dest_data in dest: + assert isinstance(dest_data, LoraTorchTensor) + # logger.info(f"{orig_name} --> {dest_name}") + yield (dest_name + ".lora_a", dest_data._lora_A) + yield (dest_name + ".lora_b", dest_data._lora_B) - model_instance._get_tensors = model_instance.get_tensors # type: ignore - model_instance.get_tensors = types.MethodType(get_tensors, model_instance) + model_instance 
= LoraModel(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) + logger.info("Set model parameters") + model_instance.set_gguf_parameters() - model_instance._extra_f16_tensors = model_instance.extra_f16_tensors # type: ignore - model_instance.extra_f16_tensors = types.MethodType(extra_f16_tensors, model_instance) + # adapter_config = json.load(input_json) + model_instance.gguf_writer.add_string("training.type", "finetune_lora") - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - logger.info("Exporting model...") - model_instance.write() - logger.info(f"Model successfully exported to {model_instance.fname_out}") + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + logger.info("Exporting model...") + model_instance.write() + logger.info(f"Model successfully exported to {model_instance.fname_out}") From 8956543c091c6851089ed7467fa44ac2b2b0ee37 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 15 Jul 2024 02:35:06 -0400 Subject: [PATCH 25/33] convert_hf : simplify modify_tensors for InternLM2 * convert_lora : lazy conversion * llama : load and use alpha from LoRA adapters --- convert_hf_to_gguf.py | 33 ++++------ convert_lora_to_gguf.py | 134 ++++++++++++++++++++++++++++------------ gguf-py/gguf/quants.py | 2 +- src/llama.cpp | 22 +++++-- 4 files changed, 124 insertions(+), 67 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ebb5ca376133b..70ea963f2e879 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2222,13 +2222,6 @@ def set_vocab(self): special_vocab.add_to_gguf(self.gguf_writer) - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - def set_gguf_parameters(self): self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) @@ -2248,26 +2241,22 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] + n_embd = self.hparams["hidden_size"] q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads + head_dim = n_embd // num_heads num_groups = num_heads // q_per_kv - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] + if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: qkv = data_torch - # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + + qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) + q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] + # The model weights of q and k equire additional reshape. 
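The LoraModel subclass pairs each PEFT lora_A/lora_B tensor under its base name, lets the regular modify_tensors mapping run on the combined LoraTorchTensor, and finally splits the result back into two GGUF tensors. Roughly, for a hypothetical PEFT name (the mapped name in the comment is only a typical example; the real value comes from Model.map_tensor_name):

    peft_name = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
    base_name = (peft_name
                 .replace("base_model.model.", "")
                 .replace(".lora_A.weight", ".weight")
                 .replace(".lora_B.weight", ".weight"))
    assert base_name == "model.layers.0.self_attn.q_proj.weight"
    # map_tensor_name would typically turn this into something like "blk.0.attn_q.weight",
    # and the adapter GGUF then stores two tensors for it:
    #   blk.0.attn_q.weight.lora_a   (the A factor)
    #   blk.0.attn_q.weight.lora_b   (the B factor)

Because modify_tensors sees the combined tensor, architecture-specific transforms such as the q/k permutes apply consistently to both factors, which is why the earlier lora_B special-casing in convert_hf_to_gguf.py could be dropped.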
- # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) - # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) - # v = rearrange(v, " o g n i -> o (g n i)").T - v = v.reshape((v.shape[0], -1)).T + q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + v = v.reshape((-1, v.shape[-1])) + return [ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 2d01fdc466f9c..71d3e57f55720 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -8,9 +8,10 @@ import argparse import os import sys +import json +from math import prod from pathlib import Path -from types import EllipsisType -from typing import TYPE_CHECKING, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast +from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast import torch @@ -22,7 +23,7 @@ import gguf # reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import Model +from convert_hf_to_gguf import LazyTorchTensor, Model logger = logging.getLogger("lora-to-gguf") @@ -35,37 +36,45 @@ class PartialLoraTensor: # magic to support tensor shape modifications and splitting class LoraTorchTensor: - _lora_A: Tensor - _lora_B: Tensor + _lora_A: Tensor # (n_rank, row_size) + _lora_B: Tensor # (col_size, n_rank) _rank: int def __init__(self, A: Tensor, B: Tensor): assert len(A.shape) == len(B.shape) + assert A.shape[-2] == B.shape[-1] if A.dtype != B.dtype: A = A.to(torch.float32) B = B.to(torch.float32) self._lora_A = A self._lora_B = B - assert self._lora_A.shape[-2] == self._lora_B.shape[-1] - self._rank = self._lora_B.shape[-1] + self._rank = B.shape[-1] + + def get_lora_A_B(self) -> tuple[Tensor, Tensor]: + return (self._lora_A, self._lora_B) def __getitem__( self, indices: ( SupportsIndex | slice - | tuple[SupportsIndex | slice | EllipsisType | Tensor, ...] + | tuple[SupportsIndex | slice | Tensor, ...] 
# TODO: add ellipsis in the type signature ), ) -> LoraTorchTensor: shape = self.shape - if isinstance(indices, (SupportsIndex, slice)): + if isinstance(indices, SupportsIndex): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) else: - raise NotImplementedError + raise NotImplementedError # can't return a vector + elif isinstance(indices, slice): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + return LoraTorchTensor(self._lora_A, self._lora_B[indices]) elif isinstance(indices, tuple): assert len(indices) > 0 - if isinstance(indices[-1], EllipsisType): + if indices[-1] is Ellipsis: return self[indices[:-1]] # expand ellipsis indices = tuple( @@ -73,7 +82,7 @@ def __getitem__( for v in ( ( (slice(None, None) for _ in range(len(indices) - 1)) - if isinstance(i, EllipsisType) + if i is Ellipsis else (i,) ) for i in indices @@ -85,11 +94,14 @@ def __getitem__( indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape)))) # TODO: make sure this is correct - # lora_A has a shape which looks like (..., 1, 1, rank, self.shape[-1]) indices_A = ( *( - 0 if isinstance(i, SupportsIndex) else slice(None, None) - for i in indices[:-2] + ( + j.__index__() % self._lora_A.shape[i] + if isinstance(j, SupportsIndex) + else slice(None, None) + ) + for i, j in enumerate(indices[:-2]) ), slice(None, None), indices[-1], @@ -97,7 +109,7 @@ def __getitem__( indices_B = indices[:-1] return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) else: - raise NotImplementedError + raise NotImplementedError # unknown indice type @property def dtype(self) -> torch.dtype: @@ -106,23 +118,37 @@ def dtype(self) -> torch.dtype: @property def shape(self) -> tuple[int, ...]: + assert len(self._lora_A.shape) == len(self._lora_B.shape) return (*self._lora_B.shape[:-1], self._lora_A.shape[-1]) def size(self, dim=None): assert dim is None return self.shape - def reshape(self, *shape: int | tuple[int]) -> LoraTorchTensor: + def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: if isinstance(shape[0], tuple): - new_shape: tuple[int] = shape[0] + new_shape: tuple[int, ...] 
= shape[0] else: - new_shape = cast(tuple[int], shape) + new_shape = cast(tuple[int, ...], shape) orig_shape = self.shape + if len(new_shape) < 2: + raise NotImplementedError # can't become a vector + + # expand -1 in the shape + if any(dim == -1 for dim in new_shape): + n_elems = prod(orig_shape) + n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) + assert n_elems % n_new_elems == 0 + new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),) + if new_shape[-1] != orig_shape[-1]: - raise NotImplementedError + raise NotImplementedError # can't reshape the row size trivially + + shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) + shape_B = (*new_shape[:-1], self._rank) return LoraTorchTensor( - self._lora_A.reshape((*(1 for _ in new_shape[:-2]), *self._lora_A.shape[-2:])), - self._lora_B.reshape((*new_shape[:-1], self._rank)), + self._lora_A.reshape(shape_A), + self._lora_B.reshape(shape_B), ) def reshape_as(self, other: Tensor) -> LoraTorchTensor: @@ -134,12 +160,15 @@ def view(self, *size: int) -> LoraTorchTensor: def permute(self, *dims: int) -> LoraTorchTensor: shape = self.shape dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) - if dims[-1] == -2 and dims[-2] == -1: - return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) - else: - assert dims[-1] == -1 + if dims[-1] == -1: + # TODO: support higher dimensional A shapes bigger than 1 assert all(dim == 1 for dim in self._lora_A.shape[:-2]) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) + if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: + return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) + else: + # TODO: compose the above two + raise NotImplementedError def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: shape = self.shape @@ -181,11 +210,13 @@ def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): torch.cat([a._lora_A for a in args[0]], dim), torch.cat([b._lora_B for b in args[0]], dim), ) - else: + elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]): return LoraTorchTensor( - args[0][0]._lora_A, # TODO: is this correct? (can't cat over the rank) + args[0][0]._lora_A, torch.cat([b._lora_B for b in args[0]], dim), ) + else: + raise NotImplementedError else: raise NotImplementedError @@ -205,13 +236,17 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", help="model is executed on big endian machine", ) + parser.add_argument( + "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) parser.add_argument( "--verbose", action="store_true", help="increase output verbosity", @@ -237,13 +272,16 @@ def parse_args() -> argparse.Namespace: "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, } + ftype = ftype_map[args.outtype] - dir_base_model = args.base - dir_lora = args.lora_path - input_json = os.path.join(dir_lora, "adapter_config.json") - input_model = os.path.join(dir_lora, "adapter_model.safetensors") + dir_base_model: Path = args.base + dir_lora: Path = args.lora_path + lora_config = dir_lora / "adapter_config.json" + input_model = dir_lora / "adapter_model.safetensors" + if args.outfile is not None: fname_out = args.outfile else: @@ -276,6 +314,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} for name, tensor in lora_model.items(): + if self.lazy: + tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) is_lora_a = ".lora_A.weight" in name is_lora_b = ".lora_B.weight" in name @@ -305,16 +345,30 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter dest = super().modify_tensors(data_torch, name, bid) for dest_name, dest_data in dest: assert isinstance(dest_data, LoraTorchTensor) - # logger.info(f"{orig_name} --> {dest_name}") - yield (dest_name + ".lora_a", dest_data._lora_A) - yield (dest_name + ".lora_b", dest_data._lora_B) - - model_instance = LoraModel(dir_base_model, ftype, fname_out, args.bigendian, False, False, None) + lora_a, lora_b = dest_data.get_lora_A_B() + + yield (dest_name + ".lora_a", lora_a) + yield (dest_name + ".lora_b", lora_b) + + model_instance = LoraModel( + dir_base_model, + ftype, + fname_out, + is_big_endian=args.bigendian, + use_temp_file=False, + eager=args.no_lazy, + model_name=None, + ) logger.info("Set model parameters") model_instance.set_gguf_parameters() - # adapter_config = json.load(input_json) + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + alpha = lparams["lora_alpha"] + model_instance.gguf_writer.add_string("training.type", "finetune_lora") + model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index b22eec1661ce7..16e0a9aaa8a8b 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -43,7 +43,7 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np. 
osize *= dim out = np.empty(shape=osize, dtype=otype) # compute over groups of 16 rows (arbitrary, but seems good for performance) - n_groups = rows.shape[0] // 16 + n_groups = (rows.shape[0] // 16) or 1 np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out) return out.reshape(oshape) diff --git a/src/llama.cpp b/src/llama.cpp index 30ecbb801069d..3906b9ea16f7b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -379,6 +379,7 @@ enum llm_kv { LLM_KV_TOKENIZER_EOT_ID, LLM_KV_TRAINING_TYPE, + LLM_KV_TRAINING_LORA_ALPHA, }; static const std::map LLM_KV_NAMES = { @@ -473,7 +474,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TRAINING_TYPE, "training.type" }, + { LLM_KV_TRAINING_TYPE, "training.type" }, + { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, }; struct LLM_KV { @@ -2848,6 +2850,8 @@ struct llama_lora_adapter { std::vector ctxs; std::vector bufs; + float alpha; + llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { base_model->lora_adapters.insert(this); } @@ -7878,10 +7882,12 @@ static struct ggml_tensor * llm_build_lora_mm( struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); for (auto & it : lctx.lora_adapters) { struct llama_lora_weight * lora = it.first->get_weight(w); - float scale = it.second; if (lora == nullptr) { continue; } + const float alpha = it.first->alpha; + const float rank = (float) lora->b->ne[0]; + const float scale = alpha ? it.second * alpha / rank : it.second; struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) @@ -7902,10 +7908,12 @@ static struct ggml_tensor * llm_build_lora_mm_id( struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); for (auto & it : lctx.lora_adapters) { struct llama_lora_weight * lora = it.first->get_weight(w); - float scale = it.second; if (lora == nullptr) { continue; } + const float alpha = it.first->alpha; + const float rank = (float) lora->b->ne[0]; + const float scale = alpha ? it.second * alpha / rank : it.second; struct ggml_tensor * ab_cur = ggml_mul_mat_id( ctx0, lora->b, ggml_mul_mat_id(ctx0, lora->a, cur, ids), @@ -18587,10 +18595,14 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c // check metadata { - auto get_kv_str = [&](std::string key) -> std::string { + auto get_kv_str = [&](const std::string & key) -> std::string { int id = gguf_find_key(ctx_gguf, key.c_str()); return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); }; + auto get_kv_f32 = [&](const std::string & key) -> float { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 
0.0f : gguf_get_val_f32(ctx_gguf, id); + }; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto lora_arch = llm_arch_from_string(lora_arch_name); @@ -18604,6 +18616,8 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c gguf_free(ctx_gguf); throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); } + + adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); From 87301bdd59554604ce0103fe39580a1608cf97cd Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 15 Jul 2024 03:23:19 -0400 Subject: [PATCH 26/33] llama : use llm_build_lora_mm in most model graphs --- src/llama.cpp | 238 +++++++++++++++++++++++++------------------------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3906b9ea16f7b..5c7edf6b7f496 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8945,13 +8945,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -9024,7 +9024,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9060,13 +9060,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -9127,7 +9127,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9176,7 +9176,7 @@ struct llm_build_context { cur = attn_norm; } - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -9247,7 +9247,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9292,21 +9292,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct 
ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -9398,7 +9398,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -9449,7 +9449,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -9529,7 +9529,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -9571,7 +9571,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -9634,7 +9634,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9666,13 +9666,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -9728,7 +9728,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9780,7 +9780,7 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { @@ -9790,7 +9790,7 
@@ struct llm_build_context { LLM_NORM, cb, il); } - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { @@ -9799,14 +9799,14 @@ struct llm_build_context { model.layers[il].attn_k_norm_b, LLM_NORM, cb, il); } - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -9855,7 +9855,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -9960,7 +9960,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10023,7 +10023,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10070,7 +10070,7 @@ struct llm_build_context { { cur = attn_norm; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -10163,7 +10163,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10203,21 +10203,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -10313,7 +10313,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); 
@@ -10348,7 +10348,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10425,7 +10425,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10463,17 +10463,17 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -10537,7 +10537,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10578,17 +10578,17 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -10643,7 +10643,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid @@ -10683,7 +10683,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10725,7 +10725,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10735,9 +10735,9 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, 
ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -10803,7 +10803,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -10849,7 +10849,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); @@ -10857,9 +10857,9 @@ struct llm_build_context { Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -10931,7 +10931,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10971,13 +10971,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11036,7 +11036,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + 
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11078,7 +11078,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -11141,7 +11141,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11177,7 +11177,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -11252,7 +11252,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11290,21 +11290,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11370,7 +11370,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11408,21 +11408,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11488,7 +11488,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, 
cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11539,21 +11539,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11635,7 +11635,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11672,13 +11672,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11743,7 +11743,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11785,13 +11785,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11866,7 +11866,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -11911,21 +11911,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, 
model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11992,7 +11992,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12044,7 +12044,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens} - struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur); + struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, cur); // split the above in two // => {d_inner, n_tokens} struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); @@ -12083,14 +12083,14 @@ struct llm_build_context { // ssm { // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} - struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); + struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_x, x); // split struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0); struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank); struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens} - dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); + dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); // Custom operator to optimize the parallel associative scan @@ -12121,7 +12121,7 @@ struct llm_build_context { y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens} - cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, y); } // residual @@ -12140,7 +12140,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12179,21 +12179,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, 
model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -12283,7 +12283,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -12336,21 +12336,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -12419,7 +12419,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12459,7 +12459,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -12544,7 +12544,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12579,7 +12579,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -12686,7 +12686,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12727,13 +12727,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -12818,7 
+12818,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -13087,7 +13087,7 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { @@ -13096,7 +13096,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { @@ -13105,7 +13105,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { @@ -13136,7 +13136,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "attn_sub_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); if (model.layers[il].bo) { cur = ggml_add(ctx0, cur, model.layers[il].bo); @@ -13173,7 +13173,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_sub_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); cb(cur, "ffn_down", il); @@ -13192,7 +13192,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -13540,7 +13540,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -13599,7 +13599,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -13641,7 +13641,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -13714,7 +13714,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); From 42415a4874e0f963e4aca6796ea5dfb97cd17464 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 11:41:18 +0200 Subject: [PATCH 27/33] auto scale --- common/common.cpp | 5 ++++- convert_lora_to_gguf.py | 2 ++ include/llama.h | 35 
+++++++++++++++++++++-------------- src/llama.cpp | 10 +++++++++- 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index dac152c4fc4b6..4cc71179c8dca 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,7 +684,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapter.emplace_back(argv[i], 0.0f); return true; } if (arg == "--lora-scaled") { @@ -2089,6 +2089,9 @@ std::tuple llama_init_from_gpt_par llama_free_model(model); return std::make_tuple(nullptr, nullptr); } + if (lora_scale == 0.0f) { + lora_scale = llama_lora_adapter_get_default_scale(adapter); + } llama_lora_adapter_set(lctx, adapter, lora_scale); } diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 71d3e57f55720..be0b6f272084d 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -366,9 +366,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter lparams: dict[str, Any] = json.load(f) alpha = lparams["lora_alpha"] + rank = lparams["r"] model_instance.gguf_writer.add_string("training.type", "finetune_lora") model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) + model_instance.gguf_writer.add_float32("training.lora.scale", float(alpha) / float(rank)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") diff --git a/include/llama.h b/include/llama.h index c57d21f0c70b9..01ea884669cb9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -513,12 +513,33 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + + // + // LoRA + // + // Load a LoRA adapter from file // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_model * model, const char * path_lora); + // Get default scale of an adapter + LLAMA_API float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter); + // Add a loaded LoRA adapter to given context // This will not modify model's weight LLAMA_API int32_t llama_lora_adapter_set( @@ -536,20 +557,6 @@ extern "C" { // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); - // Apply a loaded control vector to a llama_context, or if data is NULL, clear - // the currently loaded vector. - // n_embd should be the size of a single layer's control, and data should point - // to an n_embd x n_layers buffer starting from layer 1. - // il_start and il_end are the layer range the vector should apply to (both inclusive) - // See llama_control_vector_load in common to load a control vector. 
- LLAMA_API int32_t llama_control_vector_apply( - struct llama_context * lctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end); - // // KV cache // diff --git a/src/llama.cpp b/src/llama.cpp index d5a7bb62bda2b..4c77b101437e0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -380,6 +380,7 @@ enum llm_kv { LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_LORA_ALPHA, + LLM_KV_TRAINING_LORA_SCALE, }; static const std::map LLM_KV_NAMES = { @@ -476,6 +477,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TRAINING_TYPE, "training.type" }, { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, + { LLM_KV_TRAINING_LORA_SCALE, "training.lora.scale" }, }; struct LLM_KV { @@ -2851,6 +2853,7 @@ struct llama_lora_adapter { std::vector bufs; float alpha; + float scale; // default scale llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { base_model->lora_adapters.insert(this); @@ -18578,7 +18581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -18615,6 +18618,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); + adapter.scale = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_SCALE)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -18749,6 +18753,10 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ggml_free(ctx); } +float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter) { + return adapter->scale; +} + int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, From 5b181182488796da836651fa8e053ca7fcb34192 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 11:48:51 +0200 Subject: [PATCH 28/33] Revert "auto scale" This reverts commit 42415a4874e0f963e4aca6796ea5dfb97cd17464. 
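For reference (illustration only; none of the code below appears in the patches themselves): both the alpha/rank computation added to llm_build_lora_mm earlier in the series and the "auto scale" change that this commit reverts encode the usual LoRA convention, where the low-rank delta B*A is weighted by alpha divided by the adapter rank on top of whatever scale the caller passes to llama_lora_adapter_set. A minimal C++ sketch of that effective scale, mirroring the hunk in llm_build_lora_mm (the helper name is hypothetical):

#include <cstdint>

// Hypothetical helper, not part of any patch in this series.
// user_scale : the per-adapter scale passed to llama_lora_adapter_set()
// alpha      : adapter alpha read from the GGUF metadata ("training.lora.alpha" here,
//              renamed to "adapter.lora.alpha" later in the series)
// rank       : LoRA rank, i.e. the inner dimension shared by the A/B tensors
//              (taken from lora->b->ne[0] in the graph-building code above)
static float lora_effective_scale(float user_scale, float alpha, int64_t rank) {
    // if no alpha was stored (0.0f), fall back to the raw user scale
    return alpha != 0.0f ? user_scale * alpha / (float) rank : user_scale;
}

With that scale, each adapted matmul becomes W*x + lora_effective_scale(...) * B*(A*x), which is what llm_build_lora_mm builds into the graph.
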
--- common/common.cpp | 5 +---- convert_lora_to_gguf.py | 2 -- include/llama.h | 35 ++++++++++++++--------------------- src/llama.cpp | 10 +--------- 4 files changed, 16 insertions(+), 36 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4cc71179c8dca..dac152c4fc4b6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,7 +684,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 0.0f); + params.lora_adapter.emplace_back(argv[i], 1.0f); return true; } if (arg == "--lora-scaled") { @@ -2089,9 +2089,6 @@ std::tuple llama_init_from_gpt_par llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - if (lora_scale == 0.0f) { - lora_scale = llama_lora_adapter_get_default_scale(adapter); - } llama_lora_adapter_set(lctx, adapter, lora_scale); } diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index be0b6f272084d..71d3e57f55720 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -366,11 +366,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter lparams: dict[str, Any] = json.load(f) alpha = lparams["lora_alpha"] - rank = lparams["r"] model_instance.gguf_writer.add_string("training.type", "finetune_lora") model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) - model_instance.gguf_writer.add_float32("training.lora.scale", float(alpha) / float(rank)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") diff --git a/include/llama.h b/include/llama.h index 01ea884669cb9..c57d21f0c70b9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -513,33 +513,12 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a loaded control vector to a llama_context, or if data is NULL, clear - // the currently loaded vector. - // n_embd should be the size of a single layer's control, and data should point - // to an n_embd x n_layers buffer starting from layer 1. - // il_start and il_end are the layer range the vector should apply to (both inclusive) - // See llama_control_vector_load in common to load a control vector. - LLAMA_API int32_t llama_control_vector_apply( - struct llama_context * lctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end); - - // - // LoRA - // - // Load a LoRA adapter from file // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_model * model, const char * path_lora); - // Get default scale of an adapter - LLAMA_API float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter); - // Add a loaded LoRA adapter to given context // This will not modify model's weight LLAMA_API int32_t llama_lora_adapter_set( @@ -557,6 +536,20 @@ extern "C" { // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. 
+ // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + // // KV cache // diff --git a/src/llama.cpp b/src/llama.cpp index 4c77b101437e0..d5a7bb62bda2b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -380,7 +380,6 @@ enum llm_kv { LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_LORA_ALPHA, - LLM_KV_TRAINING_LORA_SCALE, }; static const std::map LLM_KV_NAMES = { @@ -477,7 +476,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TRAINING_TYPE, "training.type" }, { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, - { LLM_KV_TRAINING_LORA_SCALE, "training.lora.scale" }, }; struct LLM_KV { @@ -2853,7 +2851,6 @@ struct llama_lora_adapter { std::vector bufs; float alpha; - float scale; // default scale llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { base_model->lora_adapters.insert(this); @@ -18581,7 +18578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -18618,7 +18615,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); - adapter.scale = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_SCALE)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -18753,10 +18749,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ggml_free(ctx); } -float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter) { - return adapter->scale; -} - int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, From f68d092459059df92b0ba68b0b64282c1d56c56d Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 12:12:22 +0200 Subject: [PATCH 29/33] remove redundant params --- src/llama.cpp | 104 +++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d5a7bb62bda2b..bc2d53c967add 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8079,8 +8079,8 @@ static struct ggml_tensor * llm_build_ffn( } static struct ggml_tensor * llm_build_moe_ffn( - struct llama_context & lctx, struct ggml_context * ctx, + struct llama_context & lctx, struct ggml_tensor * cur, struct ggml_tensor * gate_inp, struct ggml_tensor * up_exps, @@ -8180,11 +8180,8 @@ static struct ggml_tensor * llm_build_moe_ffn( } static struct ggml_tensor * llm_build_kqv( - struct llama_context & lctx, struct ggml_context * ctx, - const llama_model & model, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -8196,6 +8193,10 @@ static struct ggml_tensor * llm_build_kqv( float kq_scale, const llm_build_cb & cb, int il) { + const llama_model & model = lctx.model; + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; + 
const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); @@ -8309,11 +8310,8 @@ static struct ggml_tensor * llm_build_kqv( } static struct ggml_tensor * llm_build_kv( - struct llama_context & lctx, struct ggml_context * ctx, - const llama_model & model, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -8328,6 +8326,8 @@ static struct ggml_tensor * llm_build_kv( float kq_scale, const llm_build_cb & cb, int il) { + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced @@ -8339,7 +8339,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; - cur = llm_build_kqv(lctx, ctx, model, hparams, cparams, kv, graph, wo, wo_b, + cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -8836,7 +8836,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8873,7 +8873,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -8971,7 +8971,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9076,7 +9076,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9197,7 +9197,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9321,7 +9321,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -9353,7 +9353,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9471,7 +9471,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, 
NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9494,7 +9494,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9581,7 +9581,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9675,7 +9675,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9970,7 +9970,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10102,13 +10102,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10253,7 +10253,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10372,7 +10372,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10486,7 +10486,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10601,7 +10601,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); 
} @@ -10624,7 +10624,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(lctx, ctx0, cur, + llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -10758,7 +10758,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10878,7 +10878,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10986,7 +10986,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11088,7 +11088,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11199,7 +11199,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11319,7 +11319,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11437,7 +11437,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11568,7 +11568,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11690,7 +11690,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11808,7 +11808,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, 
kv_head, n_kv, 1.0f, cb, il); } @@ -11945,7 +11945,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12238,7 +12238,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12370,7 +12370,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12497,7 +12497,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12606,7 +12606,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12749,7 +12749,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12788,7 +12788,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(lctx, ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12971,7 +12971,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -13008,7 +13008,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(lctx, ctx0, cur, + llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -13126,7 +13126,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -13555,7 +13555,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, 
model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -13668,7 +13668,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -18578,7 +18578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { From 9175f4b77c4166b964f5eaffbd2da19a91576b71 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Jul 2024 15:02:46 +0200 Subject: [PATCH 30/33] Apply suggestions from code review Co-authored-by: slaren --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d047d1a3117d2..f94bee142fcff 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7886,7 +7886,7 @@ static struct ggml_tensor * llm_build_lora_mm( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) ); - ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + ab_cur = ggml_scale(ctx0, ab_cur, scale); res = ggml_add(ctx0, res, ab_cur); } return res; @@ -7913,7 +7913,7 @@ static struct ggml_tensor * llm_build_lora_mm_id( ggml_mul_mat_id(ctx0, lora->a, cur, ids), ids ); - ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + ab_cur = ggml_scale(ctx0, ab_cur, scale); res = ggml_add(ctx0, res, ab_cur); } return res; From 0ba23bad6f2169c90b94a605a4d72614821ad7cc Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 15:35:19 +0200 Subject: [PATCH 31/33] change kv metadata --- convert_hf_to_gguf.py | 1 + convert_lora_to_gguf.py | 9 ++++----- gguf-py/gguf/constants.py | 8 ++++++++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama.cpp | 31 ++++++++++++++++++++----------- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c5eb7bdbb9bce..a66228d71ed31 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -186,6 +186,7 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return new_name def set_gguf_parameters(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 71d3e57f55720..bfd252d2e7ecd 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -359,17 +359,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter eager=args.no_lazy, model_name=None, ) - logger.info("Set model parameters") - model_instance.set_gguf_parameters() with open(lora_config, "r") as f: lparams: dict[str, Any] = json.load(f) alpha = lparams["lora_alpha"] - model_instance.gguf_writer.add_string("training.type", "finetune_lora") - model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) - + model_instance.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[model_instance.model_arch]) + 
model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) + model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a95a44237e348..390d2d1890e2a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -19,6 +19,7 @@ class Keys: class General: + TYPE = "general.type" ARCHITECTURE = "general.architecture" QUANTIZATION_VERSION = "general.quantization_version" ALIGNMENT = "general.alignment" @@ -120,10 +121,17 @@ class Tokenizer: MIDDLE_ID = "tokenizer.ggml.middle_token_id" EOT_ID = "tokenizer.ggml.eot_token_id" + class Adapter: + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + # # recommended mapping of model tensor names for storage in gguf # +class GGUFType: + MODEL = "model" + ADAPTER = "adapter" class MODEL_ARCH(IntEnum): LLAMA = auto() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index cf95541629032..b0197961d46a8 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -424,6 +424,9 @@ def close(self) -> None: fout.close() self.fout = None + def add_type(self, type_name: str) -> None: + self.add_string(Keys.General.TYPE, type_name) + def add_architecture(self) -> None: self.add_string(Keys.General.ARCHITECTURE, self.arch) diff --git a/src/llama.cpp b/src/llama.cpp index f94bee142fcff..07bb427135d8c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -287,6 +287,7 @@ static const std::map LLM_ARCH_NAMES = { }; enum llm_kv { + LLM_KV_GENERAL_TYPE, LLM_KV_GENERAL_ARCHITECTURE, LLM_KV_GENERAL_QUANTIZATION_VERSION, LLM_KV_GENERAL_ALIGNMENT, @@ -378,11 +379,12 @@ enum llm_kv { LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, - LLM_KV_TRAINING_TYPE, - LLM_KV_TRAINING_LORA_ALPHA, + LLM_KV_ADAPTER_TYPE, + LLM_KV_ADAPTER_LORA_ALPHA, }; static const std::map LLM_KV_NAMES = { + { LLM_KV_GENERAL_TYPE, "general.type" }, { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, @@ -474,8 +476,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TRAINING_TYPE, "training.type" }, - { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, }; struct LLM_KV { @@ -18596,20 +18598,27 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c return id < 0 ? 
0.0f : gguf_get_val_f32(ctx_gguf, id); }; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); - auto lora_arch = llm_arch_from_string(lora_arch_name); - if (lora_arch != model->arch) { + + auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); + if (general_type != "adapter") { + gguf_free(ctx_gguf); + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); + auto general_arch = llm_arch_from_string(general_arch_str); + if (general_arch != model->arch) { gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } - auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); - if (train_type != "finetune_lora") { + auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); + if (adapter_type != "lora") { gguf_free(ctx_gguf); - throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); } - adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); + adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); From b1c40695029f4317a8c1598d1523631b60755b44 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 15 Jul 2024 17:22:38 +0200 Subject: [PATCH 32/33] move add_type to __init__ --- convert_hf_to_gguf.py | 2 +- gguf-py/gguf/constants.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a66228d71ed31..ecf5aa234caca 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -99,6 +99,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) + self.gguf_writer.add_type(gguf.GGUFType.MODEL) @classmethod def __init_subclass__(cls): @@ -186,7 +187,6 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return new_name def set_gguf_parameters(self): - self.gguf_writer.add_type(gguf.GGUFType.MODEL) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 390d2d1890e2a..5eb3df706e6e2 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -129,10 +129,12 @@ class Adapter: # recommended mapping of model tensor names for storage in gguf # + class GGUFType: MODEL = "model" ADAPTER = "adapter" + class MODEL_ARCH(IntEnum): LLAMA = auto() FALCON = auto() From d09382fac7ec9eafa8b94c656ec55feb250e4bee Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 15 Jul 2024 11:39:42 -0400 Subject: [PATCH 33/33] convert_hf : move add_type to main() * convert_lora : use the GGUFWriter from Model instead of overwriting it --- convert_hf_to_gguf.py | 2 +- convert_lora_to_gguf.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 
ecf5aa234caca..a755b0a60bf0a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -99,7 +99,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) - self.gguf_writer.add_type(gguf.GGUFType.MODEL) @classmethod def __init_subclass__(cls): @@ -3575,6 +3574,7 @@ def main() -> None: small_first_shard=args.no_tensor_first_split) logger.info("Set model parameters") + model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) model_instance.set_gguf_parameters() logger.info("Set model tokenizer") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bfd252d2e7ecd..4bb939d45d6bd 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -365,7 +365,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter alpha = lparams["lora_alpha"] - model_instance.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[model_instance.model_arch]) model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
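
Editor's note (not part of the patches): the net effect of the llm_build_lora_mm change reviewed above is that every matmul against a weight that has LoRA factors also accumulates scale * B*(A*x) into the result at graph-build time. Below is a minimal, self-contained sketch of that pattern; lora_weight_sketch, lora_adapter_sketch and build_lora_mm_sketch are illustrative stand-ins for the llama_lora_adapter machinery in the series, the alpha/rank scaling is the conventional LoRA formula rather than a quote of the final code, and only the ggml calls (ggml_mul_mat, ggml_scale, ggml_add) are existing API.

#include "ggml.h"

#include <map>
#include <string>
#include <vector>

// Simplified stand-ins for the adapter structs introduced in this series.
struct lora_weight_sketch {
    ggml_tensor * a; // factor A, shape [n_embd, r]
    ggml_tensor * b; // factor B, shape [r, n_out]
};

struct lora_adapter_sketch {
    std::map<std::string, lora_weight_sketch> ab_map; // keyed by base weight tensor name
    float alpha      = 0.0f;                          // read from adapter.lora.alpha
    float user_scale = 1.0f;                          // per-adapter scale requested by the caller
};

// LoRA-aware replacement for ggml_mul_mat(ctx, w, cur):
//   res = w*cur + sum over adapters of scale * B*(A*cur)
static ggml_tensor * build_lora_mm_sketch(
        ggml_context * ctx,
        const std::vector<lora_adapter_sketch> & adapters,
        const std::string & w_name,
        ggml_tensor * w,
        ggml_tensor * cur) {
    ggml_tensor * res = ggml_mul_mat(ctx, w, cur);

    for (const auto & adapter : adapters) {
        const auto it = adapter.ab_map.find(w_name);
        if (it == adapter.ab_map.end()) {
            continue; // this adapter has no weights for w
        }
        const lora_weight_sketch & lw = it->second;

        // conventional LoRA scaling: user scale * alpha / rank, where rank is the
        // shared dimension of A and B
        const float rank  = (float) lw.b->ne[0];
        const float scale = adapter.user_scale * adapter.alpha / rank;

        // delta = B * (A * cur), scaled and accumulated into the base result;
        // the non-inplace ggml_scale matches the review fix applied above
        ggml_tensor * ab = ggml_mul_mat(ctx, lw.b, ggml_mul_mat(ctx, lw.a, cur));
        ab  = ggml_scale(ctx, ab, scale);
        res = ggml_add(ctx, res, ab);
    }
    return res;
}

Because the delta is added as extra graph nodes rather than merged into the base weights, adapters can live in their own backend buffers and be swapped without reloading the model, which is the point of the refactor.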
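
Editor's note (not part of the patches): the adapter.* metadata introduced in "change kv metadata" can also be inspected from the reader side with the plain gguf C API. The following is a hedged sketch of such a check, not code from the series; the key strings mirror the ones added to gguf-py and llama.cpp above, and gguf_init_from_file, gguf_find_key, gguf_get_val_str and gguf_get_val_f32 are existing API.

#include "ggml.h"

#include <cstdio>
#include <cstring>

// Returns true if the GGUF file at `path` carries the LoRA adapter metadata
// layout written by convert_lora_to_gguf.py after this series.
static bool is_lora_adapter_gguf(const char * path) {
    struct gguf_init_params params = {
        /* .no_alloc = */ true,    // metadata only
        /* .ctx      = */ nullptr, // no ggml context needed for this check
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
    if (ctx_gguf == nullptr) {
        return false;
    }

    bool ok = false;
    const int id_type    = gguf_find_key(ctx_gguf, "general.type");
    const int id_adapter = gguf_find_key(ctx_gguf, "adapter.type");
    const int id_alpha   = gguf_find_key(ctx_gguf, "adapter.lora.alpha");
    if (id_type >= 0 && id_adapter >= 0 && id_alpha >= 0) {
        ok = strcmp(gguf_get_val_str(ctx_gguf, id_type),    "adapter") == 0 &&
             strcmp(gguf_get_val_str(ctx_gguf, id_adapter), "lora")    == 0;
        if (ok) {
            printf("adapter.lora.alpha = %.2f\n", gguf_get_val_f32(ctx_gguf, id_alpha));
        }
    }

    gguf_free(ctx_gguf);
    return ok;
}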