diff --git a/rwkv_graph.inc b/rwkv_graph.inc
index 0dc417e..188d63d 100644
--- a/rwkv_graph.inc
+++ b/rwkv_graph.inc
@@ -354,9 +354,9 @@ static struct ggml_tensor * rwkv_att_v5(
 
     state.att_heads = state_out;
 
-    // ggml_group_norm considers groups in the third dimension.
-    x = ggml_reshape_4d(ctx, x, 1, 1, n_embed, sequence_length);
-    x = rwkv_group_norm_eps_1e_minus5(ctx, x, head_count);
+    // Group norm with head_count groups: ggml_norm normalizes each head after the reshape.
+    x = ggml_reshape_3d(ctx, x, n_embed / head_count, head_count, sequence_length);
+    x = ggml_norm(ctx, x, 1e-5f);
     // Convert back to a regular vector.
     x = ggml_reshape_2d(ctx, x, n_embed, sequence_length);
     x = ggml_add_inplace(
@@ -571,9 +571,9 @@ static struct ggml_tensor * rwkv_att_v6(
 
     state.att_heads = state_out;
 
-    // ggml_group_norm considers groups in the third dimension.
-    x = ggml_reshape_4d(ctx, x, 1, 1, n_embed, sequence_length);
-    x = rwkv_group_norm_eps_64e_minus5(ctx, x, head_count);
+    // Group norm with head_count groups: ggml_norm normalizes each head after the reshape.
+    x = ggml_reshape_3d(ctx, x, n_embed / head_count, head_count, sequence_length);
+    x = ggml_norm(ctx, x, 64e-5f);
     // Convert back to a regular vector.
     x = ggml_reshape_2d(ctx, x, n_embed, sequence_length);
     x = ggml_add(
@@ -804,6 +804,10 @@ static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_compu
             ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_bb, output_state.att_bb));
             ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_pp, output_state.att_pp));
         }
+
+        if ((i + 1) % model.rescale_every_n_layers == 0) {
+            x = ggml_scale(ctx, x, 0.5f);
+        }
     }
 
     graph.pre_logits_nodes = graph.cgraph->n_nodes;
@@ -957,6 +961,10 @@ static bool rwkv_build_sequential_graph(struct rwkv_model & model, struct rwkv_c
             ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_bb, output_state.att_bb));
             ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_pp, output_state.att_pp));
         }
+
+        if ((i + 1) % model.rescale_every_n_layers == 0) {
+            x = ggml_scale(ctx, x, 0.5f);
+        }
     }
 
     graph.pre_logits_nodes = graph.cgraph->n_nodes;
diff --git a/rwkv_model_loading.inc b/rwkv_model_loading.inc
index ae14c45..39d2dc6 100644
--- a/rwkv_model_loading.inc
+++ b/rwkv_model_loading.inc
@@ -69,6 +69,8 @@ struct rwkv_model {
     int64_t head_count;
     int64_t head_size;
 
+    uint32_t rescale_every_n_layers = 999;
+
     struct ggml_tensor * emb;
 
     struct ggml_tensor * ln0_weight;
@@ -207,8 +209,81 @@ static bool rwkv_set_params(struct rwkv_model & model, F callback, const uint32_
     return true;
 }
 
+template <typename F>
+static bool rwkv_set_params_gguf(struct rwkv_model & model, F callback, const uint32_t n_gpu_layers) {
+    const size_t n_gpu = std::min(n_gpu_layers, model.header.n_layer + 1);
+    bool offload_head = n_gpu == (model.header.n_layer + 1);
+    bool offload_default = false;
+
+    RWKV_ENSURE_OR_FALSE(callback("token_embd.weight", model.emb, offload_default));
+    RWKV_ENSURE_OR_FALSE(callback("token_embd_norm.weight", model.ln0_weight, (n_gpu_layers > 0)));
+    RWKV_ENSURE_OR_FALSE(callback("token_embd_norm.bias", model.ln0_bias, (n_gpu_layers > 0)));
+
+    uint32_t n_layer = model.header.n_layer;
+    std::unique_ptr<struct rwkv_layer[]> layers(new(std::nothrow) struct rwkv_layer[n_layer]());
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, layers.get(), "Failed to allocate model layers");
+    model.layers = std::move(layers);
+
+    for (uint32_t i = 0; i < n_layer; i++) {
+        bool offload_layer = (i < n_gpu);
+        char buffer[128];
+        size_t offset = sprintf(buffer, "blk.%" PRId32 ".", i);
+
+        rwkv_layer & layer = model.layers[i];
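+        // Per-layer tensors, addressed by GGUF names of the form "blk.<i>.<suffix>".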
+        RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm.weight"), buffer), layer.ln1_weight, offload_layer));
+        RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm.bias"), buffer), layer.ln1_bias, offload_layer));
+
+        if (model.arch_version_major == 6) {
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_x.weight"), buffer), layer.att_time_maa_x, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_w.weight"), buffer), layer.att_time_maa_w, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_k.weight"), buffer), layer.att_time_maa_k, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_v.weight"), buffer), layer.att_time_maa_v, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_r.weight"), buffer), layer.att_time_maa_r, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_lerp_g.weight"), buffer), layer.att_time_maa_g, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_w1.weight"), buffer), layer.att_time_maa_w1, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_w2.weight"), buffer), layer.att_time_maa_w2, offload_layer));
+
+            // No GPU offloading for wkv yet.
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_first.weight"), buffer), layer.att_time_faaaa, offload_default));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_decay.weight"), buffer), layer.att_time_decay, offload_default));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_decay_w1.weight"), buffer), layer.att_time_decay_w1, offload_default));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_decay_w2.weight"), buffer), layer.att_time_decay_w2, offload_default));
+
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_key.weight"), buffer), layer.att_key, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_value.weight"), buffer), layer.att_value, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_receptance.weight"), buffer), layer.att_receptance, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_gate.weight"), buffer), layer.att_gate, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_output.weight"), buffer), layer.att_output, offload_layer));
+
+            // GroupNorm uses a custom epsilon value, which only has a CPU implementation for now.
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_ln.weight"), buffer), layer.att_ln_x_weight, offload_default));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "time_mix_ln.bias"), buffer), layer.att_ln_x_bias, offload_default));
+
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm_2.weight"), buffer), layer.ln2_weight, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "attn_norm_2.bias"), buffer), layer.ln2_bias, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_lerp_k.weight"), buffer), layer.ffn_time_maa_k, offload_layer));
+            RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_lerp_r.weight"), buffer), layer.ffn_time_maa_r, offload_layer));
+        } else {
+            return false;
+        }
+
+        RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_key.weight"), buffer), layer.ffn_key, offload_layer));
+        RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_value.weight"), buffer), layer.ffn_value, offload_layer));
+        RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "channel_mix_receptance.weight"), buffer), layer.ffn_receptance, offload_layer));
+    }
+
+    RWKV_ENSURE_OR_FALSE(callback("output_norm.weight", model.ln_out_weight, offload_head));
+    RWKV_ENSURE_OR_FALSE(callback("output_norm.bias", model.ln_out_bias, offload_head));
+    RWKV_ENSURE_OR_FALSE(callback("output.weight", model.head, offload_head));
+
+    return true;
+}
+
+#include "ggml.h"
+#include
+
 // Creates a ggml context and loads all parameter tensors from a model file.
-static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
+static bool rwkv_load_model_from_bin_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
     struct stat file_stat;
 
     std::unordered_map<std::string, struct ggml_tensor *> parameters;
@@ -332,3 +407,145 @@ static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model
 
     return true;
 }
+
+static bool rwkv_gguf_get_key_u32(gguf_context * gguf_ctx, const char * key, uint32_t & value) {
+    int key_id = gguf_find_key(gguf_ctx, key);
+    if (key_id < 0) {
+        return false;
+    }
+    value = gguf_get_val_u32(gguf_ctx, key_id);
+    return true;
+}
+
+static bool rwkv_load_model_from_gguf_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
+    gguf_context * gguf_ctx = gguf_init_from_file(file_path, {true, &model.ggml_ctx});
+
+    // int n_kv = gguf_get_n_kv(gguf_ctx);
+    int n_tensors = gguf_get_n_tensors(gguf_ctx);
+    std::string arch = std::string(gguf_get_val_str(gguf_ctx, gguf_find_key(gguf_ctx, "general.architecture")));
+    if (arch == "rwkv6") {
+        model.arch_version_major = 6;
+        model.arch_version_minor = 0;
+    } else {
+        // GGUF loading only supports RWKV v6 for now.
+        RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, false, "Unsupported architecture %s", arch.c_str());
+    }
+
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".embedding_length").c_str(), model.header.n_embed), "Failed to get n_embed");
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".block_count").c_str(), model.header.n_layer), "Failed to get n_layer");
+    if (!rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".rescale_every_n_layers").c_str(), model.rescale_every_n_layers)) {
+        model.rescale_every_n_layers = 999;
+    }
+
+    if (!rwkv_gguf_get_key_u32(gguf_ctx, (arch + ".vocab_size").c_str(), model.header.n_vocab)) {
+        int key_id = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
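+        // No vocab_size key: fall back to the number of entries in the embedded tokenizer vocabulary.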
+        if (key_id >= 0) {
+            model.header.n_vocab = gguf_get_arr_n(gguf_ctx, key_id);
+        } else {
+            RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL, false, "Failed to get n_vocab");
+        }
+    }
+
+    size_t cpu_buffer_size = 0;
+    size_t gpu_buffer_size = 0;
+    // Calculate buffer sizes for each backend.
+    RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params_gguf(
+        model,
+        [&](const char * key, struct ggml_tensor *& dest, bool offload_gpu) {
+            struct ggml_tensor * tensor = nullptr, *cur;
+            for (cur = ggml_get_first_tensor(model.ggml_ctx); cur; cur = ggml_get_next_tensor(model.ggml_ctx, cur)) {
+                if (strcmp(ggml_get_name(cur), key) == 0) {
+                    tensor = cur;
+                    break;
+                }
+            }
+            RWKV_ENSURE_OR_FALSE_MSG(tensor, "Model parameter %s not found", key);
+            if (offload_gpu && n_gpu_layers)
+                gpu_buffer_size += ggml_nbytes(tensor);
+            else
+                cpu_buffer_size += ggml_nbytes(tensor);
+            dest = tensor;
+            return true;
+        },
+        n_gpu_layers
+    ));
+
+    cpu_buffer_size += ggml_tensor_overhead() * RWKV_MAX_NODES;
+    if (n_gpu_layers) {
+        gpu_buffer_size += ggml_tensor_overhead() * RWKV_MAX_NODES;
+    }
+
+    // Allocate buffers for each backend.
+    if (n_gpu_layers) {
+        ggml_backend_t backend_gpu = model.backends.front();
+        ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_buffer(backend_gpu, gpu_buffer_size);
+        ggml_backend_buffer_set_usage(gpu_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        model.buffers_w.push_back(gpu_buffer);
+        model.tallocrs.push_back(ggml_tallocr_new(gpu_buffer));
+    }
+
+    ggml_backend_t backend_cpu = model.backends.back();
+    ggml_backend_buffer_t cpu_buffer = ggml_backend_alloc_buffer(backend_cpu, cpu_buffer_size);
+    ggml_backend_buffer_set_usage(cpu_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+    model.buffers_w.push_back(cpu_buffer);
+    model.tallocrs.push_back(ggml_tallocr_new(cpu_buffer));
+
+    gguf_free(gguf_ctx);
+
+    ggml_context * ggml_ctx;
+    gguf_ctx = gguf_init_from_file(file_path, {false, &ggml_ctx});
+
+    // Allocate tensors in backend buffers.
+    RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params_gguf(
+        model,
+        [&](const char * key, struct ggml_tensor *& dest, bool offload_gpu) {
+            struct ggml_tensor * tensor = nullptr;
+            struct ggml_tensor * tensor_gguf = nullptr;
+            struct ggml_tensor * cur;
+            for (cur = ggml_get_first_tensor(ggml_ctx); cur; cur = ggml_get_next_tensor(ggml_ctx, cur)) {
+                if (strcmp(ggml_get_name(cur), key) == 0) {
+                    tensor_gguf = cur;
+                    break;
+                }
+            }
+            for (cur = ggml_get_first_tensor(model.ggml_ctx); cur; cur = ggml_get_next_tensor(model.ggml_ctx, cur)) {
+                if (strcmp(ggml_get_name(cur), key) == 0) {
+                    tensor = cur;
+                    break;
+                }
+            }
+
+            RWKV_ENSURE_OR_FALSE_MSG(tensor && tensor_gguf, "Model parameter %s not found", key);
+            ggml_tallocr * alloc = offload_gpu ? &model.tallocrs.front() : &model.tallocrs.back();
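+            // Place the tensor in the chosen backend buffer, then copy its data from the GGUF file.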
+            ggml_tallocr_alloc(alloc, tensor);
+            dest = tensor;
+            ggml_backend_tensor_set(tensor, tensor_gguf->data, 0, ggml_nbytes(tensor));
+            return true;
+        },
+        n_gpu_layers
+    ));
+
+    model.head_count = model.layers[0].att_time_faaaa->ne[1];
+    model.head_size = model.layers[0].ln1_weight->ne[0] / model.head_count;
+
+    gguf_free(gguf_ctx);
+    ggml_free(ggml_ctx);
+
+    return true;
+}
+
+static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) {
+    std::string file_path_str(file_path);
+    size_t dot_pos = file_path_str.find_last_of('.');
+    if (dot_pos != std::string::npos) {
+        std::string extension = file_path_str.substr(dot_pos + 1);
+        if (extension == "bin") {
+            return rwkv_load_model_from_bin_file(file_path, model, n_gpu_layers);
+        } else if (extension == "gguf") {
+            return rwkv_load_model_from_gguf_file(file_path, model, n_gpu_layers);
+        }
+    }
+
+    // No extension or an unrecognized one: try the legacy format anyway.
+    return rwkv_load_model_from_bin_file(file_path, model, n_gpu_layers);
+}
\ No newline at end of file
diff --git a/rwkv_operators.inc b/rwkv_operators.inc
index 182a128..008efc4 100644
--- a/rwkv_operators.inc
+++ b/rwkv_operators.inc
@@ -77,85 +77,6 @@ static void rwkv_max_impl(
     SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP();
 }
 
-// From ggml.c
-static void rwkv_groupnorm_impl(
-    struct ggml_tensor * dst,
-    const struct ggml_tensor * src0,
-    int ith,
-    int nth,
-    void * userdata
-) {
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const float eps = ((float*)userdata)[0];
-    const int n_groups = ((int32_t*)userdata)[1];
-
-    int n_channels = src0->ne[2];
-    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
-    for (int i = ith; i < n_groups; i += nth) {
-        int start = i * n_channels_per_group;
-        int end = start + n_channels_per_group;
-        if (end > n_channels) {
-            end = n_channels;
-        }
-        int step = end - start;
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            float sum = 0.0;
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    float sumr = 0.0;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        sumr += (float)x[i00];
-                    }
-                    sum += sumr;
-                }
-            }
-            const float mean = sum / (ne00 * ne01 * step);
-
-            float sum2 = 0.0;
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-
-                    float sumr = 0.0;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        float v = x[i00] - mean;
-                        y[i00] = v;
-                        sumr += (float)(v * v);
-                    }
-                    sum2 += sumr;
-                }
-            }
-            const float variance = sum2 / (ne00 * ne01 * step);
-            const float scale = 1.0f / sqrtf(variance + eps);
-
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-                    for (int i00 = 0; i00 < ne00; i00++) {
-                        y[i00] *= scale;
-                    }
-                }
-            }
-        }
-    }
-
-    SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP();
-}
-
 // Element-wise exp(x)
 struct ggml_tensor * rwkv_exp(struct ggml_context * ctx, struct ggml_tensor * x) {
     return ggml_map_custom1(ctx, x, rwkv_exp_impl, 1, NULL);
@@ -171,21 +92,6 @@ struct ggml_tensor * rwkv_max(struct ggml_context * ctx, struct ggml_tensor * x,
     return ggml_map_custom2(ctx, x, y, rwkv_max_impl, 1, NULL);
 }
 
-// GroupNorm with custom eps value; Remove when ggml_norm supports eps as an argument.
-struct ggml_tensor * rwkv_group_norm_eps_1e_minus5(struct ggml_context * ctx, struct ggml_tensor * x, int n_groups) {
-    static float params[2];
-    params[0] = 1e-5F;
-    ((int*)params)[1] = n_groups;
-    return ggml_map_custom1_inplace(ctx, x, rwkv_groupnorm_impl, 1, params);
-}
-
-struct ggml_tensor * rwkv_group_norm_eps_64e_minus5(struct ggml_context * ctx, struct ggml_tensor * x, int n_groups) {
-    static float params[2];
-    params[0] = 64e-5F;
-    ((int*)params)[1] = n_groups;
-    return ggml_map_custom1_inplace(ctx, x, rwkv_groupnorm_impl, 1, params);
-}
-
 struct ggml_tensor * rwkv_layer_norm(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) {
     // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias`
     // Looks like ggml_norm does the first part, we only need to apply weight & bias.