Add support for loading RWKV v6 GGUF files #180

Open · wants to merge 2 commits into base: master
20 changes: 14 additions & 6 deletions rwkv_graph.inc
@@ -354,9 +354,9 @@ static struct ggml_tensor * rwkv_att_v5(

state.att_heads = state_out;

- // ggml_group_norm considers groups in the third dimension.
- x = ggml_reshape_4d(ctx, x, 1, 1, n_embed, sequence_length);
- x = rwkv_group_norm_eps_1e_minus5(ctx, x, head_count);
+ // group norm with head_count groups
+ x = ggml_reshape_3d(ctx, x, n_embed / head_count, head_count, sequence_length);
+ x = ggml_norm(ctx, x, 1e-5f);
// Convert back to a regular vector.
x = ggml_reshape_2d(ctx, x, n_embed, sequence_length);
x = ggml_add_inplace(
@@ -571,9 +571,9 @@ static struct ggml_tensor * rwkv_att_v6(

state.att_heads = state_out;

- // ggml_group_norm considers groups in the third dimension.
- x = ggml_reshape_4d(ctx, x, 1, 1, n_embed, sequence_length);
- x = rwkv_group_norm_eps_64e_minus5(ctx, x, head_count);
+ // group norm with head_count groups
+ x = ggml_reshape_3d(ctx, x, n_embed / head_count, head_count, sequence_length);
+ x = ggml_norm(ctx, x, 64e-5f);
// Convert back to a regular vector.
x = ggml_reshape_2d(ctx, x, n_embed, sequence_length);
x = ggml_add(
@@ -804,6 +804,10 @@ static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_compu
ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_bb, output_state.att_bb));
ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_pp, output_state.att_pp));
}

if ((i + 1) % model.rescale_every_n_layers == 0) {
x = ggml_scale(ctx, x, 0.5f);
}
}

graph.pre_logits_nodes = graph.cgraph->n_nodes;
@@ -957,6 +961,10 @@ static bool rwkv_build_sequential_graph(struct rwkv_model & model, struct rwkv_c
ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_bb, output_state.att_bb));
ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_pp, output_state.att_pp));
}

if ((i + 1) % model.rescale_every_n_layers == 0) {
x = ggml_scale(ctx, x, 0.5f);
}
}

graph.pre_logits_nodes = graph.cgraph->n_nodes;
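The two hunks above replace the custom `rwkv_group_norm_eps_1e_minus5` / `rwkv_group_norm_eps_64e_minus5` operators (deleted from rwkv_operators.inc below) with a plain `ggml_reshape_3d` followed by `ggml_norm`, now that `ggml_norm` takes the epsilon as an argument. Reshaping to `(n_embed / head_count, head_count, sequence_length)` and normalizing each row is the same computation as a group norm with `head_count` groups of contiguous channels. Below is a minimal CPU reference sketch of that per-head normalization; `group_norm_ref` and the flat `[n_tokens][n_embed]` layout are illustrative assumptions, not code from this patch.

```c
#include <math.h>
#include <stddef.h>

// Normalize each head's slice of the embedding to zero mean and unit
// variance, with eps added to the (population) variance -- the same result
// as reshape_3d(head_size, head_count, T) followed by ggml_norm.
static void group_norm_ref(float * x, size_t n_embed, size_t head_count,
                           size_t n_tokens, float eps) {
    const size_t head_size = n_embed / head_count;
    for (size_t t = 0; t < n_tokens; t++) {
        for (size_t h = 0; h < head_count; h++) {
            float * v = x + t * n_embed + h * head_size;
            float mean = 0.0f;
            for (size_t i = 0; i < head_size; i++) mean += v[i];
            mean /= (float) head_size;
            float var = 0.0f;
            for (size_t i = 0; i < head_size; i++) {
                const float d = v[i] - mean;
                v[i] = d;
                var += d * d;
            }
            var /= (float) head_size;
            const float scale = 1.0f / sqrtf(var + eps);
            for (size_t i = 0; i < head_size; i++) v[i] *= scale;
        }
    }
}
```

RWKV v5 keeps `eps = 1e-5f` and v6 uses `64e-5f`, matching the two deleted helpers. The other addition in this file, halving `x` with `ggml_scale` every `rescale_every_n_layers` layers, presumably mirrors checkpoints whose later-layer output weights were pre-scaled to avoid FP16 overflow; with the default of 999 set during model loading it is effectively a no-op unless the GGUF file provides the key.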
219 changes: 218 additions & 1 deletion rwkv_model_loading.inc
@@ -69,6 +69,8 @@ struct rwkv_model {
int64_t head_count;
int64_t head_size;

uint32_t rescale_every_n_layers = 999;

struct ggml_tensor * emb;

struct ggml_tensor * ln0_weight;
@@ -207,8 +209,81 @@ static bool rwkv_set_params(struct rwkv_model & model, F callback, const uint32_
return true;
}

template<typename F>
static bool rwkv_set_params_gguf(struct rwkv_model & model, F callback, const uint32_t n_gpu_layers) {
const size_t n_gpu = std::min(n_gpu_layers, model.header.n_layer + 1);
bool offload_head = n_gpu == (model.header.n_layer + 1);
bool offload_default = false;

RWKV_ENSURE_OR_FALSE(callback("token_embd.weight", model.emb, offload_default));
RWKV_ENSURE_OR_FALSE(callback("token_embd_norm.weight", model.ln0_weight, (n_gpu_layers > 0)));
RWKV_ENSURE_OR_FALSE(callback("token_embd_norm.bias", model.ln0_bias, (n_gpu_layers > 0)));

uint32_t n_layer = model.header.n_layer;
std::unique_ptr<struct rwkv_layer[]> layers(new(std::nothrow) struct rwkv_layer[n_layer]());
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, layers.get(), "Failed to allocate model layers");
model.layers = std::move(layers);

for (uint32_t i = 0; i < n_layer; i++) {
bool offload_layer = (i < n_gpu);
char buffer[128];
size_t offset = sprintf(buffer, "blk.%" PRId32 ".", i);

rwkv_layer & layer = model.layers[i];
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm.weight"), buffer), layer.ln1_weight, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm.bias"), buffer), layer.ln1_bias, offload_layer));

if (model.arch_version_major == 6) {
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_x.weight"), buffer), layer.att_time_maa_x, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_w.weight"), buffer), layer.att_time_maa_w, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_k.weight"), buffer), layer.att_time_maa_k, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_v.weight"), buffer), layer.att_time_maa_v, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_r.weight"), buffer), layer.att_time_maa_r, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_g.weight"), buffer), layer.att_time_maa_g, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_w1.weight"), buffer), layer.att_time_maa_w1, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_w2.weight"), buffer), layer.att_time_maa_w2, offload_layer));

// No gpu offloading for wkv yet
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_first.weight"), buffer), layer.att_time_faaaa, offload_default));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_decay.weight"), buffer), layer.att_time_decay, offload_default));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_decay_w1.weight"), buffer), layer.att_time_decay_w1, offload_default));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_decay_w2.weight"), buffer), layer.att_time_decay_w2, offload_default));

RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_key.weight"), buffer), layer.att_key, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_value.weight"), buffer), layer.att_value, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_receptance.weight"), buffer), layer.att_receptance, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_gate.weight"), buffer), layer.att_gate, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_output.weight"), buffer), layer.att_output, offload_layer));

// GroupNorm uses a custom epsilon value, which only has CPU implementation for now.
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_ln.weight"), buffer), layer.att_ln_x_weight, offload_default));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_ln.bias"), buffer), layer.att_ln_x_bias, offload_default));

RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm_2.weight"), buffer), layer.ln2_weight, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm_2.bias"), buffer), layer.ln2_bias, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_lerp_k.weight"), buffer), layer.ffn_time_maa_k, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_lerp_r.weight"), buffer), layer.ffn_time_maa_r, offload_layer));
} else {
return false;
}

RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_key.weight"), buffer), layer.ffn_key, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_value.weight"), buffer), layer.ffn_value, offload_layer));
RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_receptance.weight"), buffer), layer.ffn_receptance, offload_layer));
}

RWKV_ENSURE_OR_FALSE(callback("output_norm.weight", model.ln_out_weight, offload_head));
RWKV_ENSURE_OR_FALSE(callback("output_norm.bias", model.ln_out_bias, offload_head));
RWKV_ENSURE_OR_FALSE(callback("output.weight", model.head, offload_head));

return true;
}

#include "ggml.h"
#include <iostream>

// Creates a ggml context and loads all parameter tensors from a model file.
- static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
+ static bool rwkv_load_model_from_bin_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
struct stat file_stat;

std::unordered_map<std::string, struct ggml_tensor *> parameters;
@@ -332,3 +407,145 @@ static bool rwkv_load_model_from_file(struct rwkv_model

return true;
}

static bool rwkv_gguf_get_key_u32(gguf_context * gguf_ctx, const char * key, uint32_t & value) {
int key_id = gguf_find_key(gguf_ctx, key);
if (key_id < 0) {
return false;
}
value = gguf_get_val_u32(gguf_ctx, key_id);
return true;
}

static bool rwkv_load_model_from_gguf_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
gguf_context * gguf_ctx = gguf_init_from_file(file_path, {true, &model.ggml_ctx});

// int n_kv = gguf_get_n_kv(gguf_ctx);
int n_tensors = gguf_get_n_tensors(gguf_ctx);
std::string arch = std::string(gguf_get_val_str(gguf_ctx, gguf_find_key(gguf_ctx, "general.architecture")));
if (arch == "rwkv6") {
model.arch_version_major = 6;
model.arch_version_minor = 0;
} else {
// gguf only supports RWKV v6 for now
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, false, "Unsupported architecture %s", arch.c_str());
}

RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".embedding_length").c_str(), model.header.n_embed), "Failed to get n_embed");
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".block_count").c_str(), model.header.n_layer), "Failed to get n_layer");
if (!rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".rescale_every_n_layers").c_str(), model.rescale_every_n_layers)) {
model.rescale_every_n_layers = 999;
}

if (!rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".vocab_size").c_str(), model.header.n_vocab)) {
int key_id = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
if (key_id >= 0) {
model.header.n_vocab = gguf_get_arr_n(gguf_ctx, key_id);
} else {
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, false, "Failed to get n_vocab");
}
}

size_t cpu_buffer_size = 0;
size_t gpu_buffer_size = 0;
// Calculate buffer sizes for each backend.
RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params_gguf(
model,
[&](const char * key, struct ggml_tensor *& dest, bool offload_gpu) {
struct ggml_tensor * tensor = nullptr, *cur;
for (cur = ggml_get_first_tensor(model.ggml_ctx); cur; cur = ggml_get_next_tensor(model.ggml_ctx, cur)) {
if (strcmp(ggml_get_name(cur), key) == 0) {
tensor = cur;
break;
}
}
RWKV_ENSURE_OR_FALSE_MSG(tensor, "Model parameter %s not found", key);
if (offload_gpu && n_gpu_layers)
gpu_buffer_size += ggml_nbytes(tensor);
else
cpu_buffer_size += ggml_nbytes(tensor);
dest = tensor;
return true;
},
n_gpu_layers
));

cpu_buffer_size += ggml_tensor_overhead() * RWKV_MAX_NODES;
if (n_gpu_layers) {
gpu_buffer_size += ggml_tensor_overhead() * RWKV_MAX_NODES;
}

// Allocate buffers for each backend.
if (n_gpu_layers) {
ggml_backend_t backend_gpu = model.backends.front();
ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_buffer(backend_gpu, gpu_buffer_size);
ggml_backend_buffer_set_usage(gpu_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
model.buffers_w.push_back(gpu_buffer);
model.tallocrs.push_back(ggml_tallocr_new(gpu_buffer));
}

ggml_backend_t backend_cpu = model.backends.back();
ggml_backend_buffer_t cpu_buffer = ggml_backend_alloc_buffer(backend_cpu, cpu_buffer_size);
ggml_backend_buffer_set_usage(cpu_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
model.buffers_w.push_back(cpu_buffer);
model.tallocrs.push_back(ggml_tallocr_new(cpu_buffer));

gguf_free(gguf_ctx);

ggml_context * ggml_ctx;
gguf_ctx = gguf_init_from_file(file_path, {false, &ggml_ctx});

// Allocate tensors in backend buffers.
RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params_gguf(
model,
[&](const char * key, struct ggml_tensor *& dest, bool offload_gpu) {
struct ggml_tensor * tensor = nullptr;
struct ggml_tensor * tensor_gguf = nullptr;
struct ggml_tensor * cur;
for (cur = ggml_get_first_tensor(ggml_ctx); cur; cur = ggml_get_next_tensor(ggml_ctx, cur)) {
if (strcmp(ggml_get_name(cur), key) == 0) {
tensor_gguf = cur;
break;
}
}
for (cur = ggml_get_first_tensor(model.ggml_ctx); cur; cur = ggml_get_next_tensor(model.ggml_ctx, cur)) {
if (strcmp(ggml_get_name(cur), key) == 0) {
tensor = cur;
break;
}
}

RWKV_ENSURE_OR_FALSE_MSG(tensor && tensor_gguf, "Model parameter %s not found", key);
ggml_tallocr * alloc = offload_gpu ? &model.tallocrs.front() : &model.tallocrs.back();
ggml_tallocr_alloc(alloc, tensor);
dest = tensor;
ggml_backend_tensor_set(tensor, tensor_gguf->data, 0, ggml_nbytes(tensor));
return true;
},
n_gpu_layers
));

model.head_count = model.layers[0].att_time_faaaa->ne[1];
model.head_size = model.layers[0].ln1_weight->ne[0] / model.head_count;

gguf_free(gguf_ctx);
ggml_free(ggml_ctx);

return true;
}

static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
std::string file_path_str(file_path);
size_t dot_pos = file_path_str.find_last_of('.');
if (dot_pos != std::string::npos) {
std::string extension = file_path_str.substr(dot_pos + 1);
if (extension == "bin") {
return rwkv_load_model_from_bin_file(file_path, model, n_gpu_layers);
} else if (extension == "gguf") {
return rwkv_load_model_from_gguf_file(file_path, model, n_gpu_layers);
}
}
// No extension, or an unrecognized one: try the legacy format anyway.
return rwkv_load_model_from_bin_file(file_path, model, n_gpu_layers);
}
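`rwkv_load_model_from_gguf_file` above reads the file twice: a metadata-only pass (`no_alloc = true`) to fetch hyperparameters and size one weight buffer per backend, then a second pass (`no_alloc = false`) whose tensor data is copied into those buffers with `ggml_backend_tensor_set`. A minimal sketch of the metadata pass is shown below; the standalone `main`, the `rwkv6.gguf` file name, and the printed keys are placeholders for illustration, not part of the change.

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"  // this ggml revision declares the gguf API here; newer trees split it into gguf.h

int main(void) {
    struct ggml_context * meta_ctx = NULL;
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,      // build tensor metadata only, do not load data
        /*.ctx      =*/ &meta_ctx,
    };
    struct gguf_context * gctx = gguf_init_from_file("rwkv6.gguf", params);
    if (gctx == NULL) {
        fprintf(stderr, "not a readable GGUF file\n");
        return 1;
    }

    // Hyperparameters live in "<arch>.<name>" key-value entries.
    int arch_id = gguf_find_key(gctx, "general.architecture");
    int embd_id = gguf_find_key(gctx, "rwkv6.embedding_length");
    if (arch_id >= 0 && embd_id >= 0) {
        printf("arch=%s n_embed=%u tensors=%d\n",
               gguf_get_val_str(gctx, arch_id),
               gguf_get_val_u32(gctx, embd_id),
               (int) gguf_get_n_tensors(gctx));
    }

    ggml_free(meta_ctx);
    gguf_free(gctx);
    return 0;
}
```

Because `rwkv_load_model_from_file` now dispatches on the file extension, existing callers keep working with legacy `.bin` files and only need to pass a `.gguf` path to reach the new loader.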
94 changes: 0 additions & 94 deletions rwkv_operators.inc
@@ -77,85 +77,6 @@ static void rwkv_max_impl(
SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP();
}

// From ggml.c
static void rwkv_groupnorm_impl(
struct ggml_tensor * dst,
const struct ggml_tensor * src0,
int ith,
int nth,
void * userdata
) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_are_same_shape(src0, dst));

GGML_ASSERT(src0->nb[0] == sizeof(float));

GGML_TENSOR_UNARY_OP_LOCALS

const float eps = ((float*)userdata)[0];
const int n_groups = ((int32_t*)userdata)[1];

int n_channels = src0->ne[2];
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
for (int i = ith; i < n_groups; i += nth) {
int start = i * n_channels_per_group;
int end = start + n_channels_per_group;
if (end > n_channels) {
end = n_channels;
}
int step = end - start;

for (int64_t i03 = 0; i03 < ne03; i03++) {
float sum = 0.0;
for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
sumr += (float)x[i00];
}
sum += sumr;
}
}
const float mean = sum / (ne00 * ne01 * step);

float sum2 = 0.0;
for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);

float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
float v = x[i00] - mean;
y[i00] = v;
sumr += (float)(v * v);
}
sum2 += sumr;
}
}
const float variance = sum2 / (ne00 * ne01 * step);
const float scale = 1.0f / sqrtf(variance + eps);

for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
for (int i00 = 0; i00 < ne00; i00++) {
y[i00] *= scale;
}
}
}
}
}

SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP();
}

// Element-wise exp(x)
struct ggml_tensor * rwkv_exp(struct ggml_context * ctx, struct ggml_tensor * x) {
return ggml_map_custom1(ctx, x, rwkv_exp_impl, 1, NULL);
@@ -171,21 +92,6 @@ struct ggml_tensor * rwkv_max(struct ggml_context * ctx, struct ggml_tensor * x,
return ggml_map_custom2(ctx, x, y, rwkv_max_impl, 1, NULL);
}

// GroupNorm with custom eps value; Remove when ggml_norm supports eps as an argument.
struct ggml_tensor * rwkv_group_norm_eps_1e_minus5(struct ggml_context * ctx, struct ggml_tensor * x, int n_groups) {
static float params[2];
params[0] = 1e-5F;
((int*)params)[1] = n_groups;
return ggml_map_custom1_inplace(ctx, x, rwkv_groupnorm_impl, 1, params);
}

struct ggml_tensor * rwkv_group_norm_eps_64e_minus5(struct ggml_context * ctx, struct ggml_tensor * x, int n_groups) {
static float params[2];
params[0] = 64e-5F;
((int*)params)[1] = n_groups;
return ggml_map_custom1_inplace(ctx, x, rwkv_groupnorm_impl, 1, params);
}

struct ggml_tensor * rwkv_layer_norm(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) {
// LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias`
// Looks like ggml_norm does the first part, we only need to apply weight & bias.
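With the custom group-norm kernels removed, rwkv_operators.inc keeps only the element-wise helpers and `rwkv_layer_norm`, whose body is outside this diff. Going by its comment, it presumably amounts to the following sketch; the name `layer_norm_sketch` is hypothetical and the real function may differ.

```c
#include "ggml.h"

// Sketch only: LayerNorm as `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias`.
static struct ggml_tensor * layer_norm_sketch(
    struct ggml_context * ctx,
    struct ggml_tensor * x,
    struct ggml_tensor * weight,
    struct ggml_tensor * bias
) {
    x = ggml_norm(ctx, x, 1e-5f);   // (x - mean) / sqrt(var + eps)
    x = ggml_mul(ctx, x, weight);   // element-wise scale
    return ggml_add(ctx, x, bias);  // shift
}
```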