From 7ec24b47db0d0d12c78a3cfa23515b0f30ae280e Mon Sep 17 00:00:00 2001
From: Theia Vogel <theia@vgel.me>
Date: Sat, 9 Mar 2024 20:22:37 -0800
Subject: [PATCH] control vector api and implementation

---
 llama.cpp | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 llama.h   |  24 ++++++
 2 files changed, 261 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index b19616e8f9a5fa..6ec671e5de156e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2092,6 +2092,10 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask; // F32 [kv_size]
     struct ggml_tensor * inp_s_seq;  // I32 [kv_size, n_batch]

+    struct llama_control_vector * control_vector = nullptr;
+    int32_t control_vector_layer_start;
+    int32_t control_vector_layer_end;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
@@ -5416,6 +5420,8 @@ static struct ggml_tensor * llm_build_kv(
     return cur;
 }

+ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il);
+
 struct llm_build_context {
     const llama_model & model;
     const llama_context & lctx;
@@ -5770,6 +5776,14 @@ struct llm_build_context {
             }

             cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            if (lctx.control_vector != nullptr && il >= lctx.control_vector_layer_start && il <= lctx.control_vector_layer_end) {
+                ggml_tensor * layer_dir = get_control_vector_layer_tensor(lctx.control_vector, il);
+                if (layer_dir != nullptr) {
+                    cur = ggml_add(ctx0, cur, layer_dir);
+                }
+            }
             cb(cur, "l_out", il);

             // input for next layer
@@ -13183,6 +13197,229 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }

+struct llama_control_vector {
+    struct ggml_context * ctx;
+    std::vector<struct ggml_tensor *> tensors;
+
+    llama_control_vector() : ctx(nullptr) {}
+
+    ~llama_control_vector() {
+        if (this->ctx) {
+            ggml_free(this->ctx);
+        }
+    }
+};
+
+ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il) {
+    if (!vector->ctx || (size_t) il >= vector->tensors.size()) {
+        return nullptr;
+    }
+    return vector->tensors[il];
+}
+
+struct llama_control_vector * llama_control_vector_load(const char * path) {
+    int n_tensors;
+    size_t n_bytes = 0;
+    uint32_t max_direction_layer = 0;
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path, meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            LLAMA_LOG_ERROR("%s: failed to load control vector from %s\n", __func__, path);
+            ggml_free(meta_ctx);
+            return nullptr;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return nullptr;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return nullptr;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return nullptr;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
+    if (!ctx_gguf) {
+        LLAMA_LOG_ERROR("%s: failed to load control vector from %s\n", __func__, path);
+        ggml_free(ctx);
+        return nullptr;
+    }
+
+    // only allocate the result once the file has been successfully loaded
+    struct llama_control_vector * vector = new llama_control_vector();
+    vector->ctx = ctx;
+    vector->tensors.push_back(nullptr); // there's never a direction vector for layer 0
+    for (uint32_t i = 1; i <= max_direction_layer; i++) {
+        std::string name = format("direction.%u", i);
+        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor) {
+            vector->tensors.push_back(tensor);
+        } else {
+            vector->tensors.push_back(nullptr); // as a filler
+        }
+    }
+
+    gguf_free(ctx_gguf);
+    return vector;
+}
+
+struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector) {
+    struct llama_control_vector * new_vector = new llama_control_vector();
+    if (vector->ctx == nullptr) {
+        return new_vector;
+    }
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_get_mem_size(vector->ctx),
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) {
+            new_vector->tensors.push_back(nullptr);
+        } else {
+            ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+            memcpy(new_tensor->data, tensor->data, ggml_nbytes(tensor)); // ggml_dup_tensor copies only the metadata
+            new_vector->tensors.push_back(new_tensor);
+        }
+    }
+
+    new_vector->ctx = ctx;
+    return new_vector;
+}
+
+int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength) {
+    if (vector->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to scale unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) continue;
+        for (int64_t j = 0; j < ggml_nelements(tensor); j++) {
+            float v = ggml_get_f32_1d(tensor, j);
+            ggml_set_f32_1d(tensor, j, v * strength);
+        }
+    }
+
+    return 0;
+}
+
+int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other) {
+    if (vector->ctx == nullptr || other->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to add with an unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    size_t size = std::max(vector->tensors.size(), other->tensors.size());
+    for (size_t i = 0; i < size; i++) {
+        if (i >= vector->tensors.size()) {
+            vector->tensors.push_back(nullptr);
+        }
+
+        ggml_tensor * other_tensor = i < other->tensors.size() ? other->tensors[i] : nullptr;
+        if (other_tensor != nullptr) {
+            if (vector->tensors[i] == nullptr) {
+                ggml_tensor * new_tensor = ggml_dup_tensor(vector->ctx, other_tensor);
+                memcpy(new_tensor->data, other_tensor->data, ggml_nbytes(other_tensor)); // ggml_dup_tensor copies only the metadata
+                vector->tensors[i] = new_tensor;
+            } else {
+                ggml_tensor * this_tensor = vector->tensors[i];
+                size_t this_nelements  = ggml_nelements(this_tensor);
+                size_t other_nelements = ggml_nelements(other_tensor);
+
+                if (this_nelements != other_nelements) {
+                    LLAMA_LOG_ERROR("%s: attempted to add control vectors of incompatible dimension: %zu != %zu\n", __func__, this_nelements, other_nelements);
+                    return 1;
+                }
+
+                for (size_t j = 0; j < this_nelements; j++) {
+                    float a = ggml_get_f32_1d(this_tensor, j);
+                    float b = ggml_get_f32_1d(other_tensor, j);
+                    ggml_set_f32_1d(this_tensor, j, a + b);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+void llama_control_vector_free(struct llama_control_vector * vector) {
+    delete vector;
+}
+
+void llama_apply_control_vector(
+        struct llama_context * lctx,
+        struct llama_control_vector * vector,
+        int32_t control_vector_layer_start,
+        int32_t control_vector_layer_end
+) {
+    lctx->control_vector = vector;
+    lctx->control_vector_layer_start = control_vector_layer_start;
+    lctx->control_vector_layer_end = control_vector_layer_end;
+}
+
+void llama_clear_control_vector(struct llama_context * lctx) {
+    lctx->control_vector = nullptr;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
     struct llama_kv_cache_view result = {
         /*.n_cells = */ 0,
diff --git a/llama.h b/llama.h
index 7a107c7f335d56..89ca02e022b847 100644
--- a/llama.h
+++ b/llama.h
@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-backend.h"

+#include
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -436,6 +437,29 @@ extern "C" {
                      float   scale,
               const char * path_base_model,
                    int32_t   n_threads);
+
+    struct llama_control_vector;
+
+    LLAMA_API struct llama_control_vector * llama_control_vector_load(const char * path);
+    LLAMA_API struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector);
+    LLAMA_API int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength);
+    LLAMA_API int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other);
+    LLAMA_API void llama_control_vector_free(struct llama_control_vector * vector);
+    LLAMA_API void llama_apply_control_vector(
+            struct llama_context * lctx,
+            struct llama_control_vector * vector,
+            int32_t control_vector_layer_start,
+            int32_t control_vector_layer_end);
+    LLAMA_API void llama_clear_control_vector(struct llama_context * lctx);
+
+
+    // Load a control vector from a GGUF file, scale it, and apply it to a model context
+    LLAMA_API int32_t llama_load_control_vector_from_file(
+            struct llama_context * lctx,
+            const char * control_vector_path,
+            float strength,
+            int32_t layer_start,
+            int32_t layer_end);

     //
     // KV cache
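
Usage note (not part of the patch): a minimal sketch of how the new calls compose, assuming an already-initialized llama_context * ctx and a control vector GGUF at the hypothetical path "direction.gguf". Error handling is elided, and since llama_load_control_vector_from_file is only declared above (its implementation is not part of this diff), the sketch sticks to the lower-level calls:

    // load direction tensors (named "direction.<layer>") from a GGUF file
    struct llama_control_vector * cvec = llama_control_vector_load("direction.gguf"); // hypothetical path
    if (cvec != nullptr) {
        // scale every direction tensor in place; a negative strength inverts the vector
        llama_control_vector_scale(cvec, 0.8f);

        // add the scaled directions to the output of layers 10..20 (inclusive on both
        // ends, per the il >= layer_start && il <= layer_end check in the graph build)
        llama_apply_control_vector(ctx, cvec, 10, 20);

        // ... llama_decode() as usual ...

        llama_clear_control_vector(ctx);  // detach from the context,
        llama_control_vector_free(cvec);  // then free the tensors
    }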