control vector api and implementation
vgel committed Mar 10, 2024
1 parent 621e86b commit 7ec24b4
Showing 2 changed files with 259 additions and 0 deletions.
235 changes: 235 additions & 0 deletions llama.cpp
@@ -2092,6 +2092,10 @@ struct llama_context {
struct ggml_tensor * inp_s_mask; // F32 [kv_size]
struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]

struct llama_control_vector * control_vector;
int32_t control_vector_layer_start;
int32_t control_vector_layer_end;

#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
@@ -5416,6 +5420,8 @@ static struct ggml_tensor * llm_build_kv(
return cur;
}

ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il);

struct llm_build_context {
const llama_model & model;
const llama_context & lctx;
@@ -5770,6 +5776,14 @@ struct llm_build_context {
}

cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);

if (lctx.control_vector != nullptr && il >= lctx.control_vector_layer_start && il <= lctx.control_vector_layer_end) {
ggml_tensor * layer_dir = get_control_vector_layer_tensor(lctx.control_vector, il);
if (layer_dir != nullptr) {
cur = ggml_add(ctx0, cur, layer_dir);
}
}
cb(cur, "l_out", il);

// input for next layer
@@ -13183,6 +13197,227 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
}
}

struct llama_control_vector {
struct ggml_context * ctx;
std::vector<ggml_tensor*> tensors;

llama_control_vector() : ctx(nullptr) {}

~llama_control_vector() {
if (this->ctx) {
ggml_free(this->ctx);
}
}
};

ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il) {
if (!vector->ctx || il < 0 || (size_t) il >= vector->tensors.size()) {
return nullptr;
}
return vector->tensors[il];
}

struct llama_control_vector * llama_control_vector_load(const char * path) {
struct llama_control_vector * vector = new llama_control_vector();

int n_tensors;
size_t n_bytes = 0;
uint32_t max_direction_layer = 0;

// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
{
struct ggml_init_params meta_params = {
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context * meta_ctx = ggml_init(meta_params);
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &meta_ctx,
};
struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path, meta_gguf_params);
if (!meta_ctx_gguf) {
LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__);
ggml_free(meta_ctx);
return nullptr;
}

n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);

// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
uint32_t layer = std::stoi(name.substr(dotpos + 1));
if (layer == 0) {
LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return nullptr;
}
if (layer > max_direction_layer) {
max_direction_layer = layer;
}
} catch (...) {
LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return nullptr;
}
}

struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return nullptr;
}
n_bytes += ggml_nbytes(tensor_meta);
}
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
}

// load and scale tensors into final control vector context
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(ggml_params);

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
if (!ctx_gguf) {
LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__);
ggml_free(ctx);
return nullptr;
}

vector->ctx = ctx;
vector->tensors.push_back(nullptr); // there's never a direction vector for 0
for (uint32_t i = 1; i <= max_direction_layer; i++) {
std::string name = format("direction.%d", i);
ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor) {
vector->tensors.push_back(tensor);
// LLAMA_LOG_INFO("%s: found control vector tensor: t[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(tensor), tensor->name, tensor->data);
} else {
vector->tensors.push_back(nullptr); // as a filler
}
}

return vector;
}

struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector) {
struct llama_control_vector * new_vector = new llama_control_vector();
if (vector->ctx == nullptr) {
return new_vector;
}
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_get_mem_size(vector->ctx),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};

struct ggml_context * ctx = ggml_init(ggml_params);

for (ggml_tensor * tensor : vector->tensors) {
if (tensor == nullptr) {
new_vector->tensors.push_back(nullptr);
} else {
ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
// ggml_dup_tensor only duplicates the tensor metadata, so copy the data explicitly
memcpy(new_tensor->data, tensor->data, ggml_nbytes(tensor));
new_vector->tensors.push_back(new_tensor);
}
}

new_vector->ctx = ctx;
return new_vector;
}

int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength) {
if (vector->ctx == nullptr) {
LLAMA_LOG_ERROR("%s: attempted to scale unloaded control vector\n", __func__);
return 1;
}

for (ggml_tensor * tensor : vector->tensors) {
if (tensor == nullptr) continue;
for (int64_t j = 0; j < ggml_nelements(tensor); j++) {
float v = ggml_get_f32_1d(tensor, j);
ggml_set_f32_1d(tensor, j, v * strength);
}
}

return 0;
}

int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other) {
if (vector->ctx == nullptr || other->ctx == nullptr) {
LLAMA_LOG_ERROR("%s: attempted to add with an unloaded control vector\n", __func__);
return 1;
}

size_t size = std::max(vector->tensors.size(), other->tensors.size());
for (size_t i = 0; i < size; i++) {
if (i >= vector->tensors.size()) {
vector->tensors.push_back(nullptr);
}

ggml_tensor * other_tensor = i < other->tensors.size() ? other->tensors[i] : nullptr;
if (other_tensor != nullptr) {
if (vector->tensors[i] == nullptr) {
ggml_tensor * new_tensor = ggml_dup_tensor(vector->ctx, other_tensor);
// ggml_dup_tensor only duplicates the metadata; copy the data from the other vector as well
memcpy(new_tensor->data, other_tensor->data, ggml_nbytes(other_tensor));
vector->tensors[i] = new_tensor;
} else {
ggml_tensor * this_tensor = vector->tensors[i];
size_t this_nelements = ggml_nelements(this_tensor);
size_t other_nelements = ggml_nelements(other_tensor);

if (this_nelements != other_nelements) {
LLAMA_LOG_ERROR("%s: attempted to add control vectors of incompatible dimension: %zu != %zu\n", __func__, this_nelements, other_nelements);
return 1;
}

for (size_t j = 0; j < this_nelements; j++) {
float a = ggml_get_f32_1d(this_tensor, j);
float b = ggml_get_f32_1d(other_tensor, j);
ggml_set_f32_1d(this_tensor, j, a + b);
}
}
}
}

return 0;
}

void llama_control_vector_free(struct llama_control_vector * vector) {
delete vector;
}

void llama_apply_control_vector(
struct llama_context * lctx,
struct llama_control_vector * vector,
int32_t control_vector_layer_start,
int32_t control_vector_layer_end
) {
lctx->control_vector = vector;
lctx->control_vector_layer_start = control_vector_layer_start;
lctx->control_vector_layer_end = control_vector_layer_end;
}

void llama_clear_control_vector(struct llama_context * lctx) {
lctx->control_vector = nullptr;
}

struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
struct llama_kv_cache_view result = {
/*.n_cells = */ 0,
24 changes: 24 additions & 0 deletions llama.h
@@ -4,6 +4,7 @@
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdint>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
@@ -436,6 +437,29 @@ extern "C" {
float scale,
const char * path_base_model,
int32_t n_threads);

struct llama_control_vector;

LLAMA_API struct llama_control_vector * llama_control_vector_load(const char * path);
LLAMA_API struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector);
LLAMA_API int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength);
LLAMA_API int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other);
LLAMA_API void llama_control_vector_free(struct llama_control_vector * vector);
LLAMA_API void llama_apply_control_vector(
struct llama_context * lctx,
struct llama_control_vector * vector,
int32_t control_vector_layer_start,
int32_t control_vector_layer_end);
LLAMA_API void llama_clear_control_vector(struct llama_context * lctx);


// Load a control vector from a GGUF file, scale it by `strength`, and apply it to the context over the given layer range
LLAMA_API int32_t llama_load_control_vector_from_file(
struct llama_context * lctx,
const char * control_vector_path,
float strength,
int32_t layer_start,
int32_t layer_end);

//
// KV cache

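For reference, a minimal sketch of how the API declared above might be driven from application code, assuming a llama_context has already been created elsewhere. The GGUF path, the strength value, and the layer range are illustrative placeholders, not part of this commit:

#include "llama.h"

// Sketch only: the file name, 1.5f strength, and layer range are made-up example values.
static int apply_control_vector_example(struct llama_context * lctx, int32_t n_layer) {
    struct llama_control_vector * cvec = llama_control_vector_load("control_vector.gguf");
    if (cvec == NULL) {
        return 1; // file missing or not a valid control vector GGUF
    }

    // scale the direction tensors in place; a negative strength steers the opposite way
    if (llama_control_vector_scale(cvec, 1.5f) != 0) {
        llama_control_vector_free(cvec);
        return 1;
    }

    // add the per-layer directions to the residual stream of layers 10 .. n_layer - 1 (inclusive)
    llama_apply_control_vector(lctx, cvec, 10, n_layer - 1);

    // ... run llama_decode() as usual; the directions are added during graph build ...

    // the context does not take ownership, so detach before freeing
    llama_clear_control_vector(lctx);
    llama_control_vector_free(cvec);
    return 0;
}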