Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for control vectors #5970

Merged
merged 5 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,34 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_base = argv[i];
} else if (arg == "--control-vector") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vectors.push_back({ 1.0f, argv[i], });
} else if (arg == "--control-vector-scaled") {
if (++i >= argc) {
invalid_param = true;
break;
}
const char * fname = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
} else if (arg == "--control-vector-layer-range") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vector_layer_start = std::stoi(argv[i]);
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vector_layer_end = std::stoi(argv[i]);
} else if (arg == "--mmproj") {
if (++i >= argc) {
invalid_param = true;
Expand Down Expand Up @@ -1095,6 +1123,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --control-vector FNAME\n");
printf(" add a control vector\n");
printf(" --control-vector-scaled FNAME S\n");
printf(" add a control vector with user defined scaling S\n");
printf(" --control-vector-layer-range START END\n");
Copy link
Contributor

@Azeirah Azeirah Mar 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to embed the scale and layer range parameters in the generated GGUF file too? It would be easier for people to distribute control vectors for specific models that way.

An end-user should still always be able to override them, if this is made possible.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would we handle the case where the user loads multiple GGUF files with conflicting layer ranges though? 🤔 Since the merged vector must cover a single range. I guess we could only add the layers for a certain vector's range...? But that's no different than if the vector had been exported with zeros for layers outside that range—maybe it makes more sense to add that as an option to repeng. 🤔

printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
Expand Down Expand Up @@ -1360,6 +1394,30 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
return std::make_tuple(nullptr, nullptr);
}

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);

const auto cvec = llama_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}

int err = llama_control_vector_apply(lctx,
cvec.data.data(),
cvec.data.size(),
cvec.n_embd,
params.control_vector_layer_start,
params.control_vector_layer_end);
if (err) {
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
}

for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
Expand Down Expand Up @@ -1890,3 +1948,160 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)

return sum / (sqrt(sum1) * sqrt(sum2));
}

//
// Control vector utils
//

static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
int32_t n_tensors;

size_t n_bytes = 0;

uint32_t max_direction_layer = 0;

llama_control_vector_data result = { -1, {} };

// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
{
struct ggml_init_params meta_params = {
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context * meta_ctx = ggml_init(meta_params);
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &meta_ctx,
};
struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!meta_ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
return result;
}

n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);

// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
uint32_t layer = std::stoi(name.substr(dotpos + 1));
if (layer == 0) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (layer > max_direction_layer) {
max_direction_layer = layer;
}
} catch (...) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
}

struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor_meta);
} else if (ggml_nelements(tensor_meta) != result.n_embd) {
fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
n_bytes += ggml_nbytes(tensor_meta);
}
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
}

if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
return result;
}

// load and scale tensors into final control vector context
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(ggml_params);

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(ctx);
return result;
}

// do not store data for layer 0 (it's not used)
result.data.resize(result.n_embd * max_direction_layer);

for (uint32_t il = 1; il <= max_direction_layer; il++) {
const std::string name = "direction." + std::to_string(il);
const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());

float * dst = result.data.data() + result.n_embd * (il - 1);

if (tensor) {
const float * src = (const float *) tensor->data;
for (int j = 0; j < result.n_embd; j++) {
dst[j] = src[j] * load_info.strength;
}
} else {
for (int j = 0; j < result.n_embd; j++) {
dst[j] = 0.0f;
}
}
}

return result;
}

llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
llama_control_vector_data result = { -1, {} };

for (const auto & info : load_infos) {
auto cur = llama_control_vector_load_one(info);

if (cur.n_embd == -1) {
return result;
}
if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
return result;
}

if (result.n_embd == -1) {
result = std::move(cur);
} else {
for (size_t i = 0; i < cur.data.size(); i++) {
result.data[i] += cur.data[i];
}
}
}

if (result.n_embd == -1) {
fprintf(stderr, "%s: no vectors passed\n", __func__);
}

return result;
}
31 changes: 30 additions & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;

struct llama_control_vector_load_info;

int32_t get_num_physical_cores();

//
// CLI argument parsing
//
int32_t get_num_physical_cores();

struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
Expand Down Expand Up @@ -103,6 +106,11 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter

std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector

int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
Expand Down Expand Up @@ -269,3 +277,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
void llama_embd_normalize(const float * inp, float * out, int n);

float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

//
// Control vector utils
//

struct llama_control_vector_data {
int n_embd;

// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
std::vector<float> data;
};

struct llama_control_vector_load_info {
float strength;

std::string fname;
};

// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
Loading