From 8d2eca3507d9bbf001bc9644424c572bee5c0726 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Wed, 31 Jul 2024 16:05:23 +0800 Subject: [PATCH 01/53] convert_hf_to_gguf: Add support for RWKV v6 Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 73 ++++++++++ gguf-py/gguf/constants.py | 241 ++++++++++++++++++++++----------- gguf-py/gguf/tensor_mapping.py | 100 +++++++++++++- 3 files changed, 332 insertions(+), 82 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index caa41aee5f30b..d109857a20216 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2716,6 +2716,79 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 +@Model.register("Rwkv6ForCausalLM") +class RwkvModel(Model): + model_arch = gguf.MODEL_ARCH.RWKV + + def set_vocab(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = ['<s>'.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + x = eval(line[line.index(' '):line.rindex(' ')]) + x = x.encode("utf-8") if isinstance(x, str) else x + assert isinstance(x, bytes) + assert len(x) == int(line[line.rindex(' '):]) + token_text: str = "" + for b in x: + token_text += f"\\x{b:02x}" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(remainder): + tokens.append(f"".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_size = self.hparams["head_size"] + hidden_size = self.hparams["hidden_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(0) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_feed_forward_length(0) # required by llama.cpp + # temporarily reuse mamba hparams + self.gguf_writer.add_ssm_inner_size(hidden_size) + self.gguf_writer.add_ssm_conv_kernel(3) + self.gguf_writer.add_ssm_state_size(head_size) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): + data_torch = data_torch.transpose(0, 1) + + if new_name.endswith("time_mix_w2.weight"): + data_torch = data_torch.permute(0, 2, 1) + + rescale_every_n_layers = self.hparams["rescale_every"] + if rescale_every_n_layers > 0: + if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): + data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + + yield (new_name, data_torch) + + @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git 
a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b55effa9907b1..b6f29ba9ee9ad 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -207,6 +207,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() GEMMA2 = auto() STARCODER2 = auto() + RWKV = auto() MAMBA = auto() XVERSE = auto() COMMAND_R = auto() @@ -270,6 +271,29 @@ class MODEL_TENSOR(IntEnum): SSM_A = auto() SSM_D = auto() SSM_OUT = auto() + TIME_MIX_W1 = auto() + TIME_MIX_W2 = auto() + TIME_MIX_LERP_X = auto() + TIME_MIX_LERP_K = auto() + TIME_MIX_LERP_V = auto() + TIME_MIX_LERP_R = auto() + TIME_MIX_LERP_G = auto() + TIME_MIX_LERP_W = auto() + TIME_MIX_FIRST = auto() + TIME_MIX_DECAY = auto() + TIME_MIX_DECAY_W1 = auto() + TIME_MIX_DECAY_W2 = auto() + TIME_MIX_KEY = auto() + TIME_MIX_VALUE = auto() + TIME_MIX_RECEPTANCE = auto() + TIME_MIX_GATE = auto() + TIME_MIX_LN = auto() + TIME_MIX_OUTPUT = auto() + CHANNEL_MIX_LERP_K = auto() + CHANNEL_MIX_LERP_R = auto() + CHANNEL_MIX_KEY = auto() + CHANNEL_MIX_RECEPTANCE = auto() + CHANNEL_MIX_VALUE = auto() ATTN_Q_A = auto() ATTN_Q_B = auto() ATTN_KV_A_MQA = auto() @@ -337,6 +361,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.RWKV: "rwkv", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", @@ -355,87 +380,110 @@ class MODEL_TENSOR(IntEnum): } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", - MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", - MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", - MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", - MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", - MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", - MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", - MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", - MODEL_TENSOR.SSM_DT: 
"blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", - MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", - MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", - MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", - MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", - MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", - MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", - MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", - MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", - MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", - MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", - MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", - MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", - MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", - MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", - MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", - MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", - MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", - MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", - MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", - MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", - MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", - MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", - MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", - MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", - MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", - MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", - MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", - MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", - MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", - MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", - MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", - MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", - MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", - MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", - MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", - MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", - MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", + MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + 
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", + MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", + MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", + MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", + MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", + MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", + MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", + MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r", + MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", + MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", + MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", + MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", + MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1", + MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2", + MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key", + MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value", + MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance", + MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate", + MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln", + MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output", + MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k", + MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r", + MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key", + MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance", + MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value", + MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", + MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", + MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", + MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", + MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", + MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", + MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", + MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", + MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", + MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", + MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", + MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", + MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", + MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", + MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", + MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", + MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", + MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", + MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", + MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", + MODEL_TENSOR.DEC_FFN_DOWN: 
"dec.blk.{bid}.ffn_down", + MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", + MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", + MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", + MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", + MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", + MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", + MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", + MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", + MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", + MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", + MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", + MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", + MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -856,6 +904,37 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.RWKV: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_LERP_X, + MODEL_TENSOR.TIME_MIX_LERP_K, + MODEL_TENSOR.TIME_MIX_LERP_V, + MODEL_TENSOR.TIME_MIX_LERP_R, + MODEL_TENSOR.TIME_MIX_LERP_G, + MODEL_TENSOR.TIME_MIX_LERP_W, + MODEL_TENSOR.TIME_MIX_FIRST, + MODEL_TENSOR.TIME_MIX_DECAY, + MODEL_TENSOR.TIME_MIX_DECAY_W1, + MODEL_TENSOR.TIME_MIX_DECAY_W2, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_GATE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.CHANNEL_MIX_LERP_K, + MODEL_TENSOR.CHANNEL_MIX_LERP_R, + MODEL_TENSOR.CHANNEL_MIX_KEY, + MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE, + MODEL_TENSOR.CHANNEL_MIX_VALUE, + ], MODEL_ARCH.MAMBA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a4f185c0658a3..bc9a13ee5bdf5 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -27,6 +27,7 @@ class TensorNameMap: "embedding.word_embeddings", # chatglm "transformer.token_embeddings", # openelm "shared", # t5 + "rwkv.embeddings", # rwkv ), # Token type embeddings @@ -40,6 +41,7 @@ class TensorNameMap: "embeddings.LayerNorm", # bert "emb_ln", # nomic-bert "transformer.norm", # openelm + "rwkv.blocks.0.pre_ln", # rwkv ), # Position embeddings @@ -57,6 +59,7 @@ class TensorNameMap: "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 "output_layer", # chatglm + "head", # rwkv ), # Output norm @@ -76,6 +79,7 @@ class TensorNameMap: "encoder.final_layernorm", # chatglm "transformer.norm", # openelm "model.norm", # nemotron + "rwkv.ln_out", # rwkv ), # Rope frequencies @@ -108,12 +112,14 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "encoder.layers.{bid}.input_layernorm", # chatglm "transformer.layers.{bid}.attn_norm", # openelm + "rwkv.blocks.{bid}.ln1", # rwkv ), # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b + "transformer.h.{bid}.ln_attn", # falcon40b "encoder.layer.{bid}.layer_norm_1", # jina-v2-code + "rwkv.blocks.{bid}.ln2", # rwkv ), # Attention query-key-value @@ -434,6 +440,98 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.out_proj", ), + MODEL_TENSOR.TIME_MIX_W1: ( + "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_W2: ( + "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 + ), + + 
MODEL_TENSOR.TIME_MIX_LERP_X: ( + "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_LERP_K: ( + "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_LERP_V: ( + "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_LERP_R: ( + "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_LERP_G: ( + "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_LERP_W: ( + "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_FIRST: ( + "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_DECAY: ( + "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_DECAY_W1: ( + "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_DECAY_W2: ( + "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 + ), + + MODEL_TENSOR.TIME_MIX_KEY: ( + "rwkv.blocks.{bid}.attention.key", # rwkv + ), + + MODEL_TENSOR.TIME_MIX_VALUE: ( + "rwkv.blocks.{bid}.attention.value", # rwkv + ), + + MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( + "rwkv.blocks.{bid}.attention.receptance", # rwkv + ), + + MODEL_TENSOR.TIME_MIX_GATE: ( + "rwkv.blocks.{bid}.attention.gate", # rwkv + ), + + MODEL_TENSOR.TIME_MIX_LN: ( + "rwkv.blocks.{bid}.attention.ln_x", # rwkv + ), + + MODEL_TENSOR.TIME_MIX_OUTPUT: ( + "rwkv.blocks.{bid}.attention.output", # rwkv + ), + + MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( + "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6 + ), + + MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( + "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6 + ), + + MODEL_TENSOR.CHANNEL_MIX_KEY: ( + "rwkv.blocks.{bid}.feed_forward.key", # rwkv + ), + + MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( + "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv + ), + + MODEL_TENSOR.CHANNEL_MIX_VALUE: ( + "rwkv.blocks.{bid}.feed_forward.value", # rwkv + ), + MODEL_TENSOR.ATTN_Q_A: ( "model.layers.{bid}.self_attn.q_a_proj", # deepseek2 ), From dc0767f4b3eaf23eae4a44248ac01b1697f64334 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:59:37 +0200 Subject: [PATCH 02/53] Add RWKV tokenization --- include/llama.h | 1 + src/llama-vocab.cpp | 126 ++++++++++++++++++++++++++++++++++++++++ src/llama.cpp | 20 +++++++ 3 files changed, 147 insertions(+) diff --git a/include/llama.h b/include/llama.h index 6cca6320b347d..53ff6e535c8a8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -66,6 +66,7 @@ extern "C" { LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram + LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization }; // pre-tokenization types diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 323660ef54cb0..30db1a04255c8 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1097,6 +1097,104 @@ struct llm_tokenizer_ugm { struct naive_trie token_matcher; }; +// +// RWKV tokenizer +// + +static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) { + std::vector<uint8_t> output; + + // Parser state + bool escaping = false; + uint8_t hex_remaining = 0; + uint8_t hex_acc = 0; + + // Step through characters, performing parsing + for (const char & c : escaped) { + // If we're parsing a hex code, interpret the next character + if (hex_remaining != 0) { + uint8_t value = (c >= 'a') ? 
(c - 'a' + 10) : (c - '0'); + hex_acc = (hex_acc << 4) + value; + + hex_remaining -= 1; + if (hex_remaining == 0) { + output.push_back(hex_acc); + hex_acc = 0; + } + + continue; + } + + // If we got an escape character, interpret it + if (escaping) { + if (c == 't') { + output.push_back('\t'); + } else if (c == 'n') { + output.push_back('\n'); + } else if (c == 'r') { + output.push_back('\r'); + } else if (c == 'x') { + hex_remaining = 2; + } else { + output.push_back(c); + } + + escaping = false; + continue; + } + + if (c == '\\') { + escaping = true; + continue; + } + + output.push_back(c); + } + + return output; +} + +struct llm_tokenizer_rwkv { + llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) { + // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens. + // For now, we decode the vocab here into the lookup we'll use for tokenization. + for (const auto & token : vocab.id_to_token) { + auto data = llama_unescape_rwkv_token(token.text); + tokens.push_back(data); + } + } + + void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { + uint32_t position = 0; + + while (position < text.size()) { + // Iterate through possible tokens backwards, starting with the largest + for (int32_t i = (int32_t)tokens.size() - 1; i >= 0; i--) { + uint32_t token_size = tokens[i].size(); + + // If there's not enough left for this token + if (text.size() - position < token_size) { + continue; + } + + // If the token doesn't match the data + if (std::memcmp(text.data() + position, tokens[i].data(), token_size) != 0) { + continue; + } + + // Add the token and advance + output.push_back(i); + position += token_size; + break; + } + } + } + + const llama_vocab & vocab; + + std::vector<std::vector<uint8_t>> tokens; +}; + // // (de-) tokenize // @@ -1401,6 +1499,23 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, output.push_back(vocab.special_eos_id); } } break; + case LLAMA_VOCAB_TYPE_RWKV: + { + for (const auto & fragment : fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + + llm_tokenizer_rwkv tokenizer(vocab); + tokenizer.tokenize(raw_text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + output.push_back(fragment.token); + } + } + } break; case LLAMA_VOCAB_TYPE_NONE: GGML_ABORT("fatal error"); } @@ -1616,6 +1731,17 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token } break; } + case LLAMA_VOCAB_TYPE_RWKV: { + std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text); + + // If we don't have enough space, return an error + if (result.size() > (size_t)length) { + return -(int)result.size(); + } + + memcpy(buf, result.data(), result.size()); + return (int)result.size(); + } default: GGML_ABORT("fatal error"); } diff --git a/src/llama.cpp b/src/llama.cpp index 8d5f24783d6ab..799c6059d5124 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -212,6 +212,7 @@ enum llm_arch { LLM_ARCH_JAIS, LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, + LLM_ARCH_RWKV, LLM_ARCH_UNKNOWN, }; @@ -259,6 +260,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, + { LLM_ARCH_RWKV, "rwkv" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1339,6 +1341,12 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA { 
LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_RWKV, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -5919,6 +5927,16 @@ static void llm_load_vocab( } #endif } + } else if (tokenizer_name == "rwkv") { + vocab.type = LLAMA_VOCAB_TYPE_RWKV; + + // default special tokens + vocab.special_bos_id = 0; + vocab.special_eos_id = 0; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.add_space_prefix = false; } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -17955,6 +17973,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5: case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: + case LLM_ARCH_RWKV: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -18123,6 +18142,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) { bool llama_model_is_recurrent(const struct llama_model * model) { switch (model->arch) { case LLM_ARCH_MAMBA: return true; + case LLM_ARCH_RWKV: return true; default: return false; } } From 865167d01a23e6e1da7e1689b10ada36ffd0f533 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Wed, 31 Jul 2024 22:16:22 +0800 Subject: [PATCH 03/53] Fix build Signed-off-by: Molly Sophia --- src/llama.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 799c6059d5124..b19b1cce4a8fa 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5927,7 +5927,7 @@ static void llm_load_vocab( } #endif } - } else if (tokenizer_name == "rwkv") { + } else if (tokenizer_model == "rwkv") { vocab.type = LLAMA_VOCAB_TYPE_RWKV; // default special tokens @@ -5936,7 +5936,6 @@ static void llm_load_vocab( vocab.special_unk_id = -1; vocab.special_sep_id = -1; vocab.special_pad_id = -1; - vocab.add_space_prefix = false; } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -6068,6 +6067,12 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; vocab.tokenizer_add_bos = false; vocab.tokenizer_add_eos = true; + } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + vocab.tokenizer_add_space_prefix = false; + vocab.tokenizer_clean_spaces = false; + vocab.tokenizer_add_bos = true; + vocab.tokenizer_add_eos = false; } else { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } From 7cac72a80b5314e2bacab7971acd337f41863088 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Fri, 12 Apr 2024 16:28:54 +0200 Subject: [PATCH 04/53] Do not use special tokens when matching in RWKV tokenizer --- src/llama-vocab.cpp | 5 +++++ src/llama.cpp | 8 ++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 30db1a04255c8..0c9e57215254b 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1170,6 +1170,11 @@ struct llm_tokenizer_rwkv { while (position < text.size()) { // Iterate through possible tokens backwards, starting with the largest for (int32_t i = (int32_t)tokens.size() - 1; i >= 0; i--) { + // Skip tokens that aren't normal type, we can't match on those + if (vocab.id_to_token[i].attr != LLAMA_TOKEN_TYPE_NORMAL) { + continue; + } + uint32_t token_size = tokens[i].size(); // If there's not enough left for this token diff --git a/src/llama.cpp b/src/llama.cpp index b19b1cce4a8fa..50a7d5ff3ad27 100644 
--- a/src/llama.cpp +++ b/src/llama.cpp @@ -5931,8 +5931,8 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_RWKV; // default special tokens - vocab.special_bos_id = 0; - vocab.special_eos_id = 0; + vocab.special_bos_id = -1; + vocab.special_eos_id = -1; vocab.special_unk_id = -1; vocab.special_sep_id = -1; vocab.special_pad_id = -1; @@ -8223,6 +8223,10 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_RWKV: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + } default: throw std::runtime_error("unknown architecture"); } From e92c74f4a1b88a0065b7fe1faabbc4ccfa579a47 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Mon, 15 Apr 2024 12:05:47 +0200 Subject: [PATCH 05/53] Fix model loading --- src/llama.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 50a7d5ff3ad27..195abba77cd8b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1345,6 +1345,7 @@ static const std::map> LLM_TENSOR_NA LLM_ARCH_RWKV, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, }, }, { @@ -8226,6 +8227,16 @@ static bool llm_load_tensors( case LLM_ARCH_RWKV: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + } + } default: throw std::runtime_error("unknown architecture"); From a0aae8d671f3aa704dd2faf7dcd1e0a689ad73c7 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:59:18 +0200 Subject: [PATCH 06/53] Add (broken) placeholder graph builder for RWKV --- src/llama.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 195abba77cd8b..ce2f87ef956ee 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -14718,6 +14718,22 @@ struct llm_build_context { return gf; } + + ggml_cgraph * build_rwkv() { + ggml_cgraph *gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // Input embeddings, start of the model after tokenizing ({n_embd, n_tokens}) + ggml_tensor *input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // Dummy operation, just to copy, we're not doing anything with it right now + ggml_tensor *output = ggml_scale(ctx0, input_embeddings, 1.0); + + // Mark the output as being the result + cb(output, "result_output", -1); + ggml_build_forward_expand(gf, output); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -14964,6 +14980,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; + case LLM_ARCH_RWKV: + { + result = llm.build_rwkv(); + } break; default: GGML_ABORT("fatal error"); } From a8667896034b2d7d99e033504d6da9820830741c Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Fri, 19 Apr 2024 10:06:00 +0200 Subject: [PATCH 07/53] Add workaround for kv cache --- src/llama.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/src/llama.cpp b/src/llama.cpp index ce2f87ef956ee..2c97a97f7585c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5799,6 +5799,12 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_RWKV: + { + // TODO: Re-using mamba keys right now, but RWKV isn't state-space + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + } break; default: (void)0; } From 4e23d9715bce337165e279eed873ed43bdbcc7c9 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Tue, 23 Apr 2024 11:12:09 +0200 Subject: [PATCH 08/53] Add logits conversion to rwkv5 --- src/llama.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 2c97a97f7585c..589b801967176 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1345,6 +1345,8 @@ static const std::map> LLM_TENSOR_NA LLM_ARCH_RWKV, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, }, }, @@ -5801,6 +5803,8 @@ static void llm_load_hparams( } break; case LLM_ARCH_RWKV: { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + // TODO: Re-using mamba keys right now, but RWKV isn't state-space ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); @@ -8234,6 +8238,13 @@ static bool llm_load_tensors( { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -14734,6 +14745,14 @@ struct llm_build_context { // Dummy operation, just to copy, we're not doing anything with it right now ggml_tensor *output = ggml_scale(ctx0, input_embeddings, 1.0); + // Something related to skipping tokens, specifics unclear + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + output = ggml_get_rows(ctx0, output, inp_out_ids); + + // Output head, convert result vector to logits + output = llm_build_norm(ctx0, output, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); + output = ggml_mul_mat(ctx0, model.output, output); + // Mark the output as being the result cb(output, "result_output", -1); ggml_build_forward_expand(gf, output); From 54795885697fdc79b9d6d96531378854a85fa6fa Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:48:24 +0200 Subject: [PATCH 09/53] Add rwkv5 layer norms --- src/llama.cpp | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 589b801967176..1c595e1e121ac 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1345,9 +1345,11 @@ static const std::map> LLM_TENSOR_NA LLM_ARCH_RWKV, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, }, }, { @@ -8238,6 +8240,10 
@@ static bool llm_load_tensors( { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // Block 0, LN0 + model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); @@ -8252,6 +8258,9 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } } @@ -14740,22 +14749,30 @@ struct llm_build_context { ggml_cgraph *gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); // Input embeddings, start of the model after tokenizing ({n_embd, n_tokens}) - ggml_tensor *input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - // Dummy operation, just to copy, we're not doing anything with it right now - ggml_tensor *output = ggml_scale(ctx0, input_embeddings, 1.0); + // x = self.layer_norm(x, self.w.blocks[0].ln0) + ggml_tensor * current = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + + for (int layer_i = 0; layer_i < n_layer; ++layer_i) { + const llama_layer * layer = &model.layers[layer_i]; + + current = llm_build_norm(ctx0, current, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, -1); + + current = llm_build_norm(ctx0, current, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, -1); + } // Something related to skipping tokens, specifics unclear - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - output = ggml_get_rows(ctx0, output, inp_out_ids); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + current = ggml_get_rows(ctx0, current, inp_out_ids); // Output head, convert result vector to logits - output = llm_build_norm(ctx0, output, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - output = ggml_mul_mat(ctx0, model.output, output); + current = llm_build_norm(ctx0, current, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); + current = ggml_mul_mat(ctx0, model.output, current); // Mark the output as being the result - cb(output, "result_output", -1); - ggml_build_forward_expand(gf, output); + cb(current, "result_output", -1); + ggml_build_forward_expand(gf, current); return gf; } From dd3aa3d40e858f886ab135c6f3aef1b12703f526 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Mon, 6 May 2024 15:31:56 +0200 Subject: [PATCH 10/53] Add time mix KVRG & correct merge mistake --- src/llama.cpp | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 1c595e1e121ac..174177775cd69 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -520,6 +520,10 @@ enum llm_tensor { LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_D, LLM_TENSOR_SSM_OUT, + LLM_TENSOR_TIME_MIX_K, + LLM_TENSOR_TIME_MIX_V, + LLM_TENSOR_TIME_MIX_R, + LLM_TENSOR_TIME_MIX_G, LLM_TENSOR_ATTN_Q_A, LLM_TENSOR_ATTN_Q_B, 
LLM_TENSOR_ATTN_KV_A_MQA, @@ -1350,6 +1354,10 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_TIME_MIX_K, "blk.%d.time_mix_k" }, + { LLM_TENSOR_TIME_MIX_V, "blk.%d.time_mix_v" }, + { LLM_TENSOR_TIME_MIX_R, "blk.%d.time_mix_r" }, + { LLM_TENSOR_TIME_MIX_G, "blk.%d.time_mix_g" }, }, }, { @@ -2514,6 +2522,12 @@ struct llama_layer { struct ggml_tensor * ssm_conv1d_b; struct ggml_tensor * ssm_dt_b; + // rwkv + struct ggml_tensor * time_mix_k; + struct ggml_tensor * time_mix_v; + struct ggml_tensor * time_mix_r; + struct ggml_tensor * time_mix_g; + // long rope factors struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_short = nullptr; @@ -8245,11 +8259,9 @@ static bool llm_load_tensors( model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // output - { - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -8261,6 +8273,11 @@ static bool llm_load_tensors( layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); + + layer.time_mix_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_K, "weight", i), {n_embd, 1, 1}); + layer.time_mix_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_V, "weight", i), {n_embd, 1, 1}); + layer.time_mix_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_R, "weight", i), {n_embd, 1, 1}); + layer.time_mix_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_G, "weight", i), {n_embd, 1, 1}); } } From b409fd8e117f5576c4942485d243ef41570ea56b Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Mon, 13 May 2024 13:32:41 +0200 Subject: [PATCH 11/53] Add remaining time mix parameters --- src/llama.cpp | 79 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 174177775cd69..287a365203917 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -520,10 +520,17 @@ enum llm_tensor { LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_D, LLM_TENSOR_SSM_OUT, - LLM_TENSOR_TIME_MIX_K, - LLM_TENSOR_TIME_MIX_V, - LLM_TENSOR_TIME_MIX_R, - LLM_TENSOR_TIME_MIX_G, + LLM_TENSOR_TIME_MIX_LERP_K, + LLM_TENSOR_TIME_MIX_LERP_V, + LLM_TENSOR_TIME_MIX_LERP_R, + LLM_TENSOR_TIME_MIX_LERP_G, + LLM_TENSOR_TIME_MIX_FIRST, + LLM_TENSOR_TIME_MIX_DECAY, + LLM_TENSOR_TIME_MIX_KEY, + LLM_TENSOR_TIME_MIX_VALUE, + LLM_TENSOR_TIME_MIX_RECEPTANCE, + LLM_TENSOR_TIME_MIX_GATE, + LLM_TENSOR_TIME_MIX_LN, LLM_TENSOR_ATTN_Q_A, LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, @@ -1348,16 +1355,23 @@ static const std::map> LLM_TENSOR_NA { LLM_ARCH_RWKV, { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, 
"token_embd_norm" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, - { LLM_TENSOR_TIME_MIX_K, "blk.%d.time_mix_k" }, - { LLM_TENSOR_TIME_MIX_V, "blk.%d.time_mix_v" }, - { LLM_TENSOR_TIME_MIX_R, "blk.%d.time_mix_r" }, - { LLM_TENSOR_TIME_MIX_G, "blk.%d.time_mix_g" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix.lerp_k" }, + { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix.lerp_v" }, + { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix.lerp_r" }, + { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix.lerp_g" }, + { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix.first" }, + { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix.decay" }, + { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix.key" }, + { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix.value" }, + { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix.receptance" }, + { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix.gate" }, + { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix.ln" }, }, }, { @@ -2523,10 +2537,20 @@ struct llama_layer { struct ggml_tensor * ssm_dt_b; // rwkv - struct ggml_tensor * time_mix_k; - struct ggml_tensor * time_mix_v; - struct ggml_tensor * time_mix_r; - struct ggml_tensor * time_mix_g; + struct ggml_tensor * time_mix_lerp_k; + struct ggml_tensor * time_mix_lerp_v; + struct ggml_tensor * time_mix_lerp_r; + struct ggml_tensor * time_mix_lerp_g; + + struct ggml_tensor * time_mix_first; + struct ggml_tensor * time_mix_decay; + struct ggml_tensor * time_mix_key; + struct ggml_tensor * time_mix_value; + struct ggml_tensor * time_mix_receptance; + struct ggml_tensor * time_mix_gate; + + struct ggml_tensor * time_mix_ln; + struct ggml_tensor * time_mix_ln_b; // long rope factors struct ggml_tensor * rope_long = nullptr; @@ -8274,10 +8298,21 @@ static bool llm_load_tensors( layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); - layer.time_mix_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_K, "weight", i), {n_embd, 1, 1}); - layer.time_mix_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_V, "weight", i), {n_embd, 1, 1}); - layer.time_mix_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_R, "weight", i), {n_embd, 1, 1}); - layer.time_mix_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_G, "weight", i), {n_embd, 1, 1}); + layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}); + layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}); + layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}); + layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}); + + // TODO: Parametrize hardcoded dimensions for first & decay + layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {64, 32}); + layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {64, 32}); + layer.time_mix_key = 
ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, n_embd}); + layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, n_embd}); + layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}); + layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {n_embd, n_embd}); + + layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}); + layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}); } } From 3cbeffc50fc2a61ed71e67ac86b96f073eabc2f2 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Mon, 13 May 2024 14:39:50 +0200 Subject: [PATCH 12/53] Add time mix output loading --- src/llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 287a365203917..a878980f8d2de 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -531,6 +531,7 @@ enum llm_tensor { LLM_TENSOR_TIME_MIX_RECEPTANCE, LLM_TENSOR_TIME_MIX_GATE, LLM_TENSOR_TIME_MIX_LN, + LLM_TENSOR_TIME_MIX_OUTPUT, LLM_TENSOR_ATTN_Q_A, LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, @@ -1372,6 +1373,7 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix.receptance" }, { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix.gate" }, { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix.ln" }, + { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix.output" }, }, }, { @@ -2551,6 +2553,7 @@ struct llama_layer { struct ggml_tensor * time_mix_ln; struct ggml_tensor * time_mix_ln_b; + struct ggml_tensor * time_mix_output; // long rope factors struct ggml_tensor * rope_long = nullptr; @@ -8313,6 +8316,7 @@ static bool llm_load_tensors( layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}); layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}); + layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, n_embd}); } } From b3b17e05fe28330bd5fed5d4b13d73f96d170851 Mon Sep 17 00:00:00 2001 From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com> Date: Wed, 15 May 2024 01:19:44 +0200 Subject: [PATCH 13/53] Add placeholder llm_build_time_mix --- src/llama.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a878980f8d2de..83964eb2c325b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9279,6 +9279,15 @@ static struct ggml_tensor * llm_build_mamba( return cur; } +static struct ggml_tensor * llm_build_time_mix( + struct ggml_context * ctx, + const struct llama_layer * layer, + struct ggml_tensor * current, + int layer_i) { + + return current; +} + struct llm_build_context { const llama_model & model; llama_context & lctx; @@ -14813,9 +14822,10 @@ struct llm_build_context { for (int layer_i = 0; layer_i < n_layer; ++layer_i) { const llama_layer * layer = &model.layers[layer_i]; - current = llm_build_norm(ctx0, current, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, -1); + current = llm_build_norm(ctx0, current, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); + current = llm_build_time_mix(ctx0, layer, current, layer_i); - current = llm_build_norm(ctx0, current, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, -1); + current = llm_build_norm(ctx0, 
current, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); } // Something related to skipping tokens, specifics unclear From 700dad1b8694aa629acea08a2e8f97f4c73a3f9b Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Thu, 1 Aug 2024 12:51:29 +0800 Subject: [PATCH 14/53] Fix build Signed-off-by: Molly Sophia --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 83964eb2c325b..184b0870af1f3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -14811,7 +14811,7 @@ struct llm_build_context { } ggml_cgraph * build_rwkv() { - ggml_cgraph *gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // Input embeddings, start of the model after tokenizing ({n_embd, n_tokens}) ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); From a180b63b49dfbf1761f98f9dd73cfc32eb504624 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Thu, 1 Aug 2024 21:45:02 +0800 Subject: [PATCH 15/53] Load more tensors for rwkv v6 Signed-off-by: Molly Sophia --- src/llama.cpp | 117 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 28 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 184b0870af1f3..818e34776ea4f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -520,18 +520,29 @@ enum llm_tensor { LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_D, LLM_TENSOR_SSM_OUT, + LLM_TENSOR_TIME_MIX_W1, + LLM_TENSOR_TIME_MIX_W2, + LLM_TENSOR_TIME_MIX_LERP_X, + LLM_TENSOR_TIME_MIX_LERP_W, LLM_TENSOR_TIME_MIX_LERP_K, LLM_TENSOR_TIME_MIX_LERP_V, LLM_TENSOR_TIME_MIX_LERP_R, LLM_TENSOR_TIME_MIX_LERP_G, LLM_TENSOR_TIME_MIX_FIRST, LLM_TENSOR_TIME_MIX_DECAY, + LLM_TENSOR_TIME_MIX_DECAY_W1, + LLM_TENSOR_TIME_MIX_DECAY_W2, LLM_TENSOR_TIME_MIX_KEY, LLM_TENSOR_TIME_MIX_VALUE, LLM_TENSOR_TIME_MIX_RECEPTANCE, LLM_TENSOR_TIME_MIX_GATE, LLM_TENSOR_TIME_MIX_LN, LLM_TENSOR_TIME_MIX_OUTPUT, + LLM_TENSOR_CHANNEL_MIX_LERP_K, + LLM_TENSOR_CHANNEL_MIX_LERP_R, + LLM_TENSOR_CHANNEL_MIX_KEY, + LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, + LLM_TENSOR_CHANNEL_MIX_VALUE, LLM_TENSOR_ATTN_Q_A, LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, @@ -1356,24 +1367,35 @@ static const std::map> LLM_TENSOR_NA { LLM_ARCH_RWKV, { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, - { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix.lerp_k" }, - { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix.lerp_v" }, - { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix.lerp_r" }, - { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix.lerp_g" }, - { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix.first" }, - { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix.decay" }, - { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix.key" }, - { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix.value" }, - { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix.receptance" }, - { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix.gate" }, - { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix.ln" }, - { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix.output" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, 
"blk.%d.attn_norm_2" }, + { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, + { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, + { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" }, + { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" }, + { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" }, + { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" }, + { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" }, + { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" }, + { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" }, + { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" }, + { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" }, + { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" }, + { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, + { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, + { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, + { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" }, + { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" }, + { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, + { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" }, + { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" }, + { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" }, + { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" }, + { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, }, }, { @@ -2539,6 +2561,10 @@ struct llama_layer { struct ggml_tensor * ssm_dt_b; // rwkv + struct ggml_tensor * time_mix_w1; + struct ggml_tensor * time_mix_w2; + struct ggml_tensor * time_mix_lerp_x; + struct ggml_tensor * time_mix_lerp_w; struct ggml_tensor * time_mix_lerp_k; struct ggml_tensor * time_mix_lerp_v; struct ggml_tensor * time_mix_lerp_r; @@ -2546,6 +2572,8 @@ struct llama_layer { struct ggml_tensor * time_mix_first; struct ggml_tensor * time_mix_decay; + struct ggml_tensor * time_mix_decay_w1; + struct ggml_tensor * time_mix_decay_w2; struct ggml_tensor * time_mix_key; struct ggml_tensor * time_mix_value; struct ggml_tensor * time_mix_receptance; @@ -2555,6 +2583,13 @@ struct llama_layer { struct ggml_tensor * time_mix_ln_b; struct ggml_tensor * time_mix_output; + struct ggml_tensor * channel_mix_lerp_k; + struct ggml_tensor * channel_mix_lerp_r; + + struct ggml_tensor * channel_mix_key; + struct ggml_tensor * channel_mix_receptance; + struct ggml_tensor * channel_mix_value; + // long rope factors struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_short = nullptr; @@ -5148,6 +5183,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ case LLAMA_VOCAB_TYPE_BPE: return "BPE"; case LLAMA_VOCAB_TYPE_WPM: return "WPM"; case LLAMA_VOCAB_TYPE_UGM: return "UGM"; + case LLAMA_VOCAB_TYPE_RWKV: return "RWKV"; default: return "unknown"; } } @@ -6125,7 +6161,7 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = false; - vocab.tokenizer_add_bos = true; + vocab.tokenizer_add_bos = false; vocab.tokenizer_add_eos = false; } else { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; @@ -6231,6 +6267,10 @@ static void llm_load_vocab( } } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { vocab.linefeed_id = vocab.special_pad_id; + } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) { + const std::vector ids = llama_tokenize_internal(vocab, "\n", false); + GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); + vocab.linefeed_id = ids[0]; } else { const std::vector ids 
= llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); @@ -8288,7 +8328,14 @@ static bool llm_load_tensors( // output model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + + // TODO: Parameterize this + const int time_mix_extra_dim = 32; + const int time_decay_extra_dim = 64; + const int head_size = 64; + const int attn_hidden_size = n_embd; + const int ffn_size = (int)(n_embd * 3.5 / 32) * 32; for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -8301,25 +8348,39 @@ static bool llm_load_tensors( layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); + layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {time_mix_extra_dim * 5, n_embd}); + layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_embd, time_mix_extra_dim, 5}); + + layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}); + layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}); layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}); layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}); layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}); layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}); // TODO: Parametrize hardcoded dimensions for first & decay - layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {64, 32}); - layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {64, 32}); - layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, n_embd}); - layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, n_embd}); - layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}); - layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {n_embd, n_embd}); + layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}); + layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}); + layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {time_decay_extra_dim, n_embd}); + layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {attn_hidden_size, time_decay_extra_dim}); + layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}); + layer.time_mix_value = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}); + layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}); + layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}); layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}); layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}); - layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, n_embd}); + layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}); + + layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}); + layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}); + + layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}); + layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}); + layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}); } - } + } break; default: throw std::runtime_error("unknown architecture"); } From 0e5ac349f8827190c874f5a46972e14a43588099 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 2 Aug 2024 12:04:36 +0800 Subject: [PATCH 16/53] Fix rwkv tokenizer Signed-off-by: Molly Sophia --- src/llama-vocab.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 0c9e57215254b..9be52d7372c21 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1171,7 +1171,7 @@ struct llm_tokenizer_rwkv { // Iterate through possible tokens backwards, starting with the largest for (int32_t i = (int32_t)tokens.size() - 1; i >= 0; i--) { // Skip tokens that aren't normal type, we can't match on those - if (vocab.id_to_token[i].attr != LLAMA_TOKEN_TYPE_NORMAL) { + if (!(vocab.id_to_token[i].attr & LLAMA_TOKEN_ATTR_NORMAL)) { continue; } From 5732de89b75cdfe5cd2c312356f84f9ce4ffa75d Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 2 Aug 2024 16:29:16 +0800 Subject: [PATCH 17/53] ggml: Add unary operator Exp Signed-off-by: Molly Sophia --- ggml/include/ggml.h | 9 +++++++ ggml/src/ggml.c | 64 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b11d047aeda7d..8ea652dc8f436 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -546,6 +546,7 @@ extern "C" { GGML_UNARY_OP_SILU, GGML_UNARY_OP_HARDSWISH, GGML_UNARY_OP_HARDSIGMOID, + GGML_UNARY_OP_EXP, GGML_UNARY_OP_COUNT, }; @@ -1139,6 +1140,14 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_exp( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_exp_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + // normalize along rows GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9c105fd353de4..f7d016dadae23 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2324,6 +2324,7 @@ inline static void ggml_vec_sigmoid_f32 (const int n, float * y, 
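The tokenizer fix above ([PATCH 16/53]) replaces an equality test against the old token-type enum with a bitmask test on the token attribute field: attributes are flags that can be combined, so only a masked check reliably identifies tokens the greedy RWKV matcher is allowed to use. A minimal illustration of the difference (the flag values here are hypothetical, not llama.cpp's real ones):

    #include <stdbool.h>
    #include <stdint.h>

    /* illustrative attribute flags (values are made up for the example) */
    enum { ATTR_NORMAL = 1 << 0, ATTR_CONTROL = 1 << 1, ATTR_BYTE = 1 << 2 };

    static bool usable_for_matching(uint32_t attr) {
        /* correct: the NORMAL bit may be set alongside other bits */
        return (attr & ATTR_NORMAL) != 0;
        /* wrong: `attr == ATTR_NORMAL` fails as soon as any other bit is set */
    }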
const float * x // TODO: optimize performance inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } +inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; @@ -2963,9 +2964,10 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "SILU", "HARDSWISH", "HARDSIGMOID", + "EXP", }; -static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13"); +static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -5359,6 +5361,19 @@ struct ggml_tensor * ggml_hardsigmoid( return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID); } +// ggml exp +struct ggml_tensor * ggml_exp( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_EXP); +} + +struct ggml_tensor * ggml_exp_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP); +} + // ggml_norm static struct ggml_tensor * ggml_norm_impl( @@ -12021,6 +12036,48 @@ static void ggml_compute_forward_hardsigmoid( } } +static void ggml_compute_forward_exp_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_exp_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_exp( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_exp_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_norm @@ -16599,6 +16656,10 @@ static void ggml_compute_forward_unary( { ggml_compute_forward_hardsigmoid(params, dst); } break; + case GGML_UNARY_OP_EXP: + { + ggml_compute_forward_exp(params, dst); + } break; default: { GGML_ABORT("fatal error"); @@ -18990,6 +19051,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_EXP: { n_tasks = 1; } break; From 0784a0cf2634b673bc0a47c1b08f3e8e9f630825 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 2 Aug 2024 13:58:34 +0800 Subject: [PATCH 18/53] RWKV v6 graph building Signed-off-by: Molly Sophia --- ggml/include/ggml.h | 10 ++ ggml/src/ggml.c | 149 ++++++++++++++++++++++- src/llama.cpp | 291 ++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 424 insertions(+), 26 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8ea652dc8f436..39aff9e39a68a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -512,6 +512,7 @@ 
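[PATCH 17/53] adds a plain element-wise exponential to ggml's unary operators. In this series it exists mainly so the RWKV v6 graph can form its per-channel decay as exp(-exp(x)), which maps any finite input into (0, 1) and therefore can only shrink the recurrent state it multiplies. A standalone sketch of that transform (plain C, outside ggml):

    #include <math.h>
    #include <stddef.h>

    /* decay[i] = exp(-exp(x[i])) lies in (0, 1) for any finite x[i],
     * so multiplying the recurrent state by it never lets the state grow. */
    static void rwkv_decay(const float *x, float *decay, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            decay[i] = expf(-expf(x[i]));
        }
    }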
extern "C" { GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, GGML_OP_ADD_REL_POS, + GGML_OP_RWKV_WKV, GGML_OP_UNARY, @@ -1896,6 +1897,15 @@ extern "C" { struct ggml_tensor * pw, struct ggml_tensor * ph); + GGML_API struct ggml_tensor * ggml_rwkv_wkv( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * r, + struct ggml_tensor * tf, + struct ggml_tensor * td, + struct ggml_tensor * state); + // custom operators typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f7d016dadae23..78f1aa3b28b95 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2835,6 +2835,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "WIN_UNPART", "GET_REL_POS", "ADD_REL_POS", + "RWKV_WKV", "UNARY", @@ -2853,7 +2854,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 78, "GGML_OP_COUNT != 78"); +static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2927,6 +2928,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "win_unpart(x)", "get_rel_pos(x)", "add_rel_pos(x)", + "rwkv_wkv(x, k, v, r, tf, td, s)", "unary(x)", @@ -2945,7 +2947,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 78, "GGML_OP_COUNT != 78"); +static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -7637,6 +7639,57 @@ struct ggml_tensor * ggml_add_rel_pos_inplace( return ggml_add_rel_pos_impl(ctx, a, pw, ph, true); } +// ggml_rwkv_wkv + +struct ggml_tensor * ggml_rwkv_wkv( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * r, + struct ggml_tensor * tf, + struct ggml_tensor * td, + struct ggml_tensor * state) { + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(r)); + GGML_ASSERT(ggml_is_contiguous(tf)); + GGML_ASSERT(ggml_is_contiguous(td)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S = k->ne[0]; + const int64_t H = k->ne[2]; + const int64_t n_tokens = k->ne[3]; + { + GGML_ASSERT(k->ne[1] == 1); + GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens); + GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens); + // TODO: RWKV v4 and v5 + GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens); + GGML_ASSERT(ggml_nelements(state) == S * S * H); + } + + bool is_node = false; + + if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad) { + GGML_ABORT("fatal error"); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { S * H, n_tokens, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_RWKV_WKV; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = k; + result->src[1] = v; + result->src[2] = r; + result->src[3] = tf; + result->src[4] = td; + result->src[5] = state; + + return result; +} + // ggml_unary static struct ggml_tensor * ggml_unary_impl( @@ -16795,6 +16848,92 @@ static void ggml_compute_forward_add_rel_pos( } } +// ggml_compute_forward_rwkv_wkv + +static void ggml_compute_forward_rwkv_wkv_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + const size_t T = dst->ne[1]; + const size_t C = dst->ne[0]; + const size_t H = dst->src[1]->ne[2]; + + float * dst_data = (float *) dst->data; + + if (params->ith != 0) { + return; + } + + memset(dst_data, 0, T * C * sizeof(float)); + + float * k = (float *) dst->src[0]->data; + float * v = (float *) dst->src[1]->data; + float * r = (float *) dst->src[2]->data; + float * time_faaaa = (float *) dst->src[3]->data; + float * time_decay = (float *) dst->src[4]->data; + float * state = (float *) dst->src[5]->data; + + size_t t_stride = H * (C / H); + + size_t h_stride = C / H; + size_t h_stride_2d = (C / H) * (C / H); + + // basically fused operations: + // dst = r @ (time_faaaa * (k @ v) + state), + // state = time_decay * state + (k @ v), + // recursive through each token + for (size_t t = 0; t < T; t++) { + size_t t_offset = t * t_stride; + + for (size_t h = 0; h < H; h++) { + size_t h_offset = h * h_stride; + size_t t_h_offset = t_offset + h_offset; + size_t h_2d_offset = h * h_stride_2d; + + for (size_t i = 0; i < C / H; i++) { + size_t t_h_i_offset = t_h_offset + i; + size_t h_i_offset = h_offset + i; + size_t h_2d_i_offset = h_2d_offset + i * h_stride; + + float k_val = k[t_h_i_offset]; + float r_val = r[t_h_i_offset]; + float time_faaaa_val = time_faaaa[h_i_offset]; + // RWKV v6: different time_decay for each token. 
+ float time_decay_val = time_decay[t_h_i_offset]; + + for (size_t j = 0; j < C / H; j ++) { + size_t t_h_j_offset = t_h_offset + j; + size_t h_2d_i_j_offset = h_2d_i_offset + j; + + float v_val = v[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state[h_2d_i_j_offset]; + float temp_val = kv_val * time_faaaa_val + prev_state_val; + dst_data[t_h_j_offset] += temp_val * r_val; + state[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; + } + } + } + } +} + +static void ggml_compute_forward_rwkv_wkv( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rwkv_wkv_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -17446,6 +17585,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_add_rel_pos(params, tensor); } break; + case GGML_OP_RWKV_WKV: + { + ggml_compute_forward_rwkv_wkv(params, tensor); + } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; @@ -18569,6 +18712,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_GET_REL_POS: case GGML_OP_ADD_REL_POS: + case GGML_OP_RWKV_WKV: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: @@ -19143,6 +19287,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: + case GGML_OP_RWKV_WKV: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: diff --git a/src/llama.cpp b/src/llama.cpp index 818e34776ea4f..c43776acd4d4d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3512,7 +3512,7 @@ static bool llama_kv_cache_find_slot( const uint32_t n_seq_tokens = batch.n_seq_tokens; if (cache.recurrent) { - // For recurrent state architectures (like Mamba), + // For recurrent state architectures (like Mamba or RWKV), // each cache cell can store the state for a whole sequence. // A slot should be always be contiguous. 
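Stripped of ggml's strides and batching, the recurrence that the fused kernel above computes is, per head and per token, the following (a single-step reference sketch; S is the head size, state is an S-by-S matrix in row-major order, and the naming is mine rather than ggml's):

    #include <stddef.h>

    /* One RWKV v6 WKV step for a single head.
     * k, v, r, decay, first ("time_faaaa"): length S;
     * state: S*S floats, state[i*S + j]; out: length S. */
    static void wkv_step(size_t S,
                         const float *k, const float *v, const float *r,
                         const float *first, const float *decay,
                         float *state, float *out) {
        for (size_t j = 0; j < S; ++j) {
            out[j] = 0.0f;
        }
        for (size_t i = 0; i < S; ++i) {
            for (size_t j = 0; j < S; ++j) {
                const float kv   = k[i] * v[j];
                const float prev = state[i*S + j];
                out[j]          += r[i] * (first[i] * kv + prev);
                state[i*S + j]   = prev * decay[i] + kv;
            }
        }
    }

Because the state written for one token feeds the next, tokens have to be processed in order, which is why the CPU implementation above only runs on thread 0 (the early return when params->ith != 0).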
@@ -3761,7 +3761,7 @@ static bool llama_kv_cache_seq_rm( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); - // models like Mamba can't have a state partially erased + // models like Mamba or RWKV can't have a state partially erased if (cache.recurrent) { if (seq_id >= (int64_t) cache.size) { // could be fatal @@ -3897,7 +3897,7 @@ static void llama_kv_cache_seq_add( if (p0 == p1) return; if (cache.recurrent) { - // for Mamba-like models, only the pos needs to be shifted + // for Mamba-like or RWKV models, only the pos needs to be shifted if (0 <= seq_id && seq_id < (int64_t) cache.size) { const int32_t tail_id = cache.cells[seq_id].tail; if (tail_id >= 0) { @@ -3946,7 +3946,7 @@ static void llama_kv_cache_seq_div( if (p0 == p1) return; if (cache.recurrent) { - // for Mamba-like models, only the pos needs to be changed + // for Mamba-like or RWKV models, only the pos needs to be changed if (0 <= seq_id && seq_id < (int64_t) cache.size) { const int32_t tail_id = cache.cells[seq_id].tail; if (tail_id >= 0) { @@ -5885,8 +5885,9 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); // TODO: Re-using mamba keys right now, but RWKV isn't state-space - ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); - ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); } break; default: (void)0; } @@ -8323,7 +8324,7 @@ static bool llm_load_tensors( // Block 0, LN0 model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); - model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // output model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); @@ -8348,8 +8349,8 @@ static bool llm_load_tensors( layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); - layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {time_mix_extra_dim * 5, n_embd}); - layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_embd, time_mix_extra_dim, 5}); + layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}); + layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}); layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}); layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}); @@ -8361,8 +8362,8 @@ static bool llm_load_tensors( // TODO: Parametrize hardcoded dimensions for first & decay layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}); layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}); - layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {time_decay_extra_dim, n_embd}); - layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {attn_hidden_size, time_decay_extra_dim}); + layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}); + layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}); layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}); layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}); layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}); @@ -9344,9 +9345,198 @@ static struct ggml_tensor * llm_build_time_mix( struct ggml_context * ctx, const struct llama_layer * layer, struct ggml_tensor * current, - int layer_i) { + struct ggml_tensor * x_prev, + struct ggml_tensor * wkv_state) { + size_t n_embed = current->ne[0]; + size_t n_tokens = current->ne[1]; + size_t head_size = layer->time_mix_first->ne[0]; + size_t head_count = layer->time_mix_first->ne[1]; + + struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current); + struct ggml_tensor * xxx = ggml_add_inplace( + ctx, + ggml_mul(ctx, sx, layer->time_mix_lerp_x), + current + ); + + xxx = ggml_reshape_4d( + ctx, + ggml_tanh_inplace( + ctx, + ggml_mul_mat(ctx, layer->time_mix_w1, xxx) + ), + layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + ); + + xxx = ggml_cont( + ctx, + ggml_permute(ctx, xxx, 0, 1, 3, 2) + ); + + xxx = ggml_mul_mat( + ctx, + ggml_reshape_4d( + ctx, + layer->time_mix_w2, + layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + ), + xxx + ); + + struct ggml_tensor *mw = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); + mw = ggml_reshape_2d( + ctx, + ggml_set_1d(ctx, mw, ggml_view_1d(ctx, xxx, n_embed * n_tokens, 0), 0), + n_embed, n_tokens + ); + + struct ggml_tensor *mk = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); + mk = ggml_reshape_2d( + ctx, + ggml_set_1d_inplace(ctx, mk, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * sizeof(float)), 0), + n_embed, n_tokens + ); - return current; + struct ggml_tensor *mv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); + mv = ggml_reshape_2d( + ctx, + ggml_set_1d_inplace(ctx, mv, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 2 * sizeof(float)), 0), + n_embed, n_tokens + ); + + struct ggml_tensor *mr = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); + mr = ggml_reshape_2d( + ctx, + ggml_set_1d_inplace(ctx, mr, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 3 * sizeof(float)), 0), + n_embed, n_tokens + ); + + struct ggml_tensor *mg = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); + mg = ggml_reshape_2d( + ctx, + ggml_set_1d_inplace(ctx, mg, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 4 * sizeof(float)), 0), + n_embed, n_tokens + ); + + struct ggml_tensor * xw = ggml_add_inplace( + ctx, + ggml_mul_inplace( + ctx, + ggml_add(ctx, mw, layer->time_mix_lerp_w), + sx + ), + current + ); + + struct ggml_tensor * xk = ggml_add_inplace( + ctx, + ggml_mul_inplace( + ctx, + ggml_add(ctx, mk, layer->time_mix_lerp_k), + sx + ), + current + ); + + struct ggml_tensor * xv = ggml_add_inplace( + ctx, + ggml_mul_inplace( + ctx, + ggml_add(ctx, mv, layer->time_mix_lerp_v), + sx + ), + current + ); + + struct ggml_tensor * xr = ggml_add_inplace( + ctx, + 
ggml_mul_inplace( + ctx, + ggml_add(ctx, mr, layer->time_mix_lerp_r), + sx + ), + current + ); + + struct ggml_tensor * xg = ggml_add_inplace( + ctx, + ggml_mul_inplace( + ctx, + ggml_add(ctx, mg, layer->time_mix_lerp_g), + sx + ), + current + ); + + struct ggml_tensor * r = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens); + struct ggml_tensor * k = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens); + struct ggml_tensor * v = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens); + struct ggml_tensor * g = ggml_silu_inplace( + ctx, + ggml_mul_mat(ctx, layer->time_mix_gate, xg) + ); + + struct ggml_tensor * w = ggml_mul_mat( + ctx, + layer->time_mix_decay_w2, + ggml_tanh_inplace( + ctx, + ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) + ) + ); + w = ggml_add_inplace( + ctx, + w, + ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed) + ); + w = ggml_exp(ctx, ggml_neg_inplace(ctx, ggml_exp(ctx, w))); + w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); + + k = ggml_transpose(ctx, k); + v = ggml_transpose(ctx, v); + r = ggml_transpose(ctx, r); + current = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, wkv_state); + + // ggml_group_norm considers groups in the third dimension. + current = ggml_reshape_4d(ctx, current, 1, 1, n_embed, n_tokens); + current = ggml_group_norm(ctx, current, head_count, 64e-5f); + // Convert back to a regular vector. + current = ggml_reshape_2d(ctx, current, n_embed, n_tokens); + current = ggml_add_inplace( + ctx, + ggml_mul_inplace( + ctx, + current, + layer->time_mix_ln + ), + layer->time_mix_ln_b + ); + + current = ggml_mul(ctx, current, g); + + return ggml_mul_mat(ctx, layer->time_mix_output, current); +} + +static struct ggml_tensor * llm_build_channel_mix( + struct ggml_context * ctx, + const struct llama_layer * layer, + struct ggml_tensor * current, + struct ggml_tensor * x_prev) { + + struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current); + struct ggml_tensor * xk = ggml_add_inplace( + ctx, + ggml_mul(ctx, sx, layer->channel_mix_lerp_k), + current + ); + struct ggml_tensor * xr = ggml_add_inplace( + ctx, + ggml_mul(ctx, sx, layer->channel_mix_lerp_r), + current + ); + struct ggml_tensor * r = ggml_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr_inplace(ctx, ggml_relu_inplace(ctx, ggml_mul_mat(ctx, layer->channel_mix_key, xk))); + return ggml_mul_inplace(ctx, r, ggml_mul_mat(ctx, layer->channel_mix_value, k)); } struct llm_build_context { @@ -14874,32 +15064,85 @@ struct llm_build_context { ggml_cgraph * build_rwkv() { ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + // Token shift state dimensions should be 2 * n_emb + GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + // Input embeddings, start of the model after tokenizing ({n_embd, n_tokens}) ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - // x = self.layer_norm(x, self.w.blocks[0].ln0) - ggml_tensor * current = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + ggml_tensor * x = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); for (int layer_i = 0; layer_i < n_layer; ++layer_i) { const llama_layer * layer = &model.layers[layer_i]; - current = llm_build_norm(ctx0, 
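The graph code above builds RWKV v6's data-dependent token shift: sx = x_prev - cur is the difference to the previous token, a small two-layer tanh network (time_mix_w1 / time_mix_w2) turns cur + sx * time_mix_lerp_x into five per-token correction vectors, and each of the five mixed inputs (for w, k, v, r, g) is cur plus sx scaled by a learned bias plus that correction. As a scalar-level reference for one token and one of the five targets (the function name and buffer layout are mine):

    #include <stddef.h>

    /* Data-dependent token-shift interpolation for one time-mix input.
     * cur, prev, lerp_bias, delta, out: length n_embd.
     * delta is the per-token output of the small w1/w2 tanh network;
     * this sketch only applies it. */
    static void time_mix_lerp(size_t n_embd,
                              const float *cur, const float *prev,
                              const float *lerp_bias, const float *delta,
                              float *out) {
        for (size_t i = 0; i < n_embd; ++i) {
            const float sx = prev[i] - cur[i];      /* token shift */
            out[i] = cur[i] + sx * (lerp_bias[i] + delta[i]);
        }
    }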
current, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); - current = llm_build_time_mix(ctx0, layer, current, layer_i); - - current = llm_build_norm(ctx0, current, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); + // TODO: handle multiple kv cache cells + struct ggml_tensor * wkv_state = ggml_view_1d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), (kv_self.size - 1) * hparams.n_embd_v_s() * ggml_type_size(kv_self.k_l[layer_i]->type)); + struct ggml_tensor * att_shift = ggml_view_1d(ctx0, kv_self.k_l[layer_i], n_embd, (kv_self.size - 1) * 2 * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)); + struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, kv_self.k_l[layer_i], n_embd, ((kv_self.size - 1) * 2 + 1) * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)); + + struct ggml_tensor * x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); + struct ggml_tensor * x_prev = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + x_prev = ggml_set_1d(ctx0, x_prev, att_shift, 0); + x_prev = ggml_set_1d( + ctx0, + x_prev, + ggml_view_1d(ctx0, x_norm, (n_tokens - 1) * n_embd, 0), + n_embd * ggml_type_size(x_prev->type) + ); + + x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, wkv_state)); + ggml_build_forward_expand(gf, x); + ggml_build_forward_expand( + gf, + ggml_cpy( + ctx0, + ggml_view_1d( + ctx0, + x_norm, + n_embd, + (n_tokens - 1) * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) + ), + att_shift + ) + ); + + x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); + x_prev = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + x_prev = ggml_set_1d(ctx0, x_prev, ffn_shift, 0); + x_prev = ggml_set_1d( + ctx0, + x_prev, + ggml_view_1d(ctx0, x_norm, (n_tokens - 1) * n_embd, 0), + n_embd * ggml_type_size(x_prev->type) + ); + x = ggml_add(ctx0, x, llm_build_channel_mix(ctx0, layer, x_norm, x_prev)); + ggml_build_forward_expand(gf, x); + ggml_build_forward_expand( + gf, + ggml_cpy( + ctx0, + ggml_view_1d( + ctx0, + x_norm, + n_embd, + (n_tokens - 1) * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) + ), + ffn_shift + ) + ); } // Something related to skipping tokens, specifics unclear ggml_tensor * inp_out_ids = build_inp_out_ids(); - current = ggml_get_rows(ctx0, current, inp_out_ids); + x = ggml_get_rows(ctx0, x, inp_out_ids); // Output head, convert result vector to logits - current = llm_build_norm(ctx0, current, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - current = ggml_mul_mat(ctx0, model.output, current); + x = llm_build_norm(ctx0, x, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); + x = ggml_mul_mat(ctx0, model.output, x); // Mark the output as being the result - cb(current, "result_output", -1); - ggml_build_forward_expand(gf, current); + cb(x, "result_output", -1); + ggml_build_forward_expand(gf, x); return gf; } From 8d498c7075f6e304b2f3e688f3a706ec11d533dc Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 6 Aug 2024 18:53:27 +0800 Subject: [PATCH 19/53] Add ``rescale_every_n_layers`` parameter Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 4 ++-- gguf-py/gguf/constants.py | 1 + gguf-py/gguf/gguf_writer.py | 3 +++ src/llama.cpp | 11 ++++++++++- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d109857a20216..65165b76414b1 100755 --- a/convert_hf_to_gguf.py +++ 
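The channel-mix branch wired into the graph above is RWKV's feed-forward path: a squared-ReLU projection gated element-wise by a sigmoid "receptance". A single-token reference sketch (row-major weight layout assumed, buffer management simplified; this mirrors the llm_build_channel_mix graph, not ggml's actual kernels):

    #include <math.h>
    #include <stddef.h>

    static float sigmoid_f(float x) { return 1.0f / (1.0f + expf(-x)); }

    /* xk, xr: token-shifted inputs of length n_embd (see the time-mix sketch).
     * Wk: [ffn x n_embd], Wv: [n_embd x ffn], Wr: [n_embd x n_embd].
     * k_buf: scratch of length ffn; out: length n_embd. */
    static void channel_mix(size_t n_embd, size_t ffn,
                            const float *xk, const float *xr,
                            const float *Wk, const float *Wv, const float *Wr,
                            float *k_buf, float *out) {
        for (size_t j = 0; j < ffn; ++j) {          /* k = relu(Wk @ xk)^2 */
            float acc = 0.0f;
            for (size_t i = 0; i < n_embd; ++i) acc += Wk[j*n_embd + i] * xk[i];
            const float relu = acc > 0.0f ? acc : 0.0f;
            k_buf[j] = relu * relu;
        }
        for (size_t j = 0; j < n_embd; ++j) {       /* out = sigmoid(Wr @ xr) * (Wv @ k) */
            float acc_v = 0.0f;
            for (size_t i = 0; i < ffn; ++i) acc_v += Wv[j*ffn + i] * k_buf[i];
            float acc_r = 0.0f;
            for (size_t i = 0; i < n_embd; ++i) acc_r += Wr[j*n_embd + i] * xr[i];
            out[j] = sigmoid_f(acc_r) * acc_v;
        }
    }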
b/convert_hf_to_gguf.py @@ -2754,6 +2754,7 @@ def set_gguf_parameters(self): head_size = self.hparams["head_size"] hidden_size = self.hparams["hidden_size"] layer_norm_eps = self.hparams["layer_norm_epsilon"] + rescale_every_n_layers = self.hparams["rescale_every"] # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) @@ -2762,14 +2763,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(0) self.gguf_writer.add_layer_norm_eps(layer_norm_eps) self.gguf_writer.add_feed_forward_length(0) # required by llama.cpp + self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) # temporarlily reuse mamba hparams self.gguf_writer.add_ssm_inner_size(hidden_size) self.gguf_writer.add_ssm_conv_kernel(3) self.gguf_writer.add_ssm_state_size(head_size) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - new_name = self.map_tensor_name(name) if not (new_name.endswith(".weight") or new_name.endswith(".bias")): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b6f29ba9ee9ad..a6883b3925097 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -94,6 +94,7 @@ class LLM: DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index af3b98c679b0b..6bc3782c37c79 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -670,6 +670,9 @@ def add_expert_shared_count(self, count: int) -> None: def add_expert_weights_scale(self, value: float) -> None: self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value) + def add_rescale_every_n_layers(self, count: int) -> None: + self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count) + def add_layer_norm_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) diff --git a/src/llama.cpp b/src/llama.cpp index c43776acd4d4d..bfc292f59aec7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -297,6 +297,7 @@ enum llm_kv { LLM_KV_DECODER_START_TOKEN_ID, LLM_KV_ATTN_LOGIT_SOFTCAPPING, LLM_KV_FINAL_LOGIT_SOFTCAPPING, + LLM_KV_RESCALE_EVERY_N_LAYERS, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -391,11 +392,12 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" }, { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, - { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, + { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, + { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -2287,6 +2289,9 @@ struct llama_hparams { float f_attn_logit_softcapping = 50.0f; float f_final_logit_softcapping = 30.0f; + // for RWKV + uint32_t rescale_every_n_layers = 0; + float rope_attn_factor = 1.0f; float rope_freq_base_train; float rope_freq_scale_train; 
@@ -5883,6 +5888,7 @@ static void llm_load_hparams( case LLM_ARCH_RWKV: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); // TODO: Re-using mamba keys right now, but RWKV isn't state-space ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); @@ -15130,6 +15136,9 @@ struct llm_build_context { ffn_shift ) ); + if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) { + x = ggml_scale_inplace(ctx0, x, 0.5F); + } } // Something related to skipping tokens, specifics unclear From 903089b5eb04998b1a0948fb1e0c769d20c48223 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Wed, 7 Aug 2024 10:35:40 +0800 Subject: [PATCH 20/53] Add ``wkv.head_size`` key for RWKV so it doesn't reuse Mamba ssm parameters Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 11 +++++------ gguf-py/gguf/constants.py | 3 +++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama.cpp | 36 +++++++++++++++++++++++++----------- 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 65165b76414b1..a9cfa9ffb0091 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2760,14 +2760,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(0) self.gguf_writer.add_layer_norm_eps(layer_norm_eps) - self.gguf_writer.add_feed_forward_length(0) # required by llama.cpp self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) - # temporarlily reuse mamba hparams - self.gguf_writer.add_ssm_inner_size(hidden_size) - self.gguf_writer.add_ssm_conv_kernel(3) - self.gguf_writer.add_ssm_state_size(head_size) + self.gguf_writer.add_wkv_head_size(head_size) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + self.gguf_writer.add_feed_forward_length(0) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a6883b3925097..32b9024804082 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -133,6 +133,9 @@ class SSM: TIME_STEP_RANK = "{arch}.ssm.time_step_rank" DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + class WKV: + HEAD_SIZE = "{arch}.wkv.head_size" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = "tokenizer.ggml.pre" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 6bc3782c37c79..0388db567d8bd 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -673,6 +673,9 @@ def add_expert_weights_scale(self, value: float) -> None: def add_rescale_every_n_layers(self, count: int) -> None: self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count) + def add_wkv_head_size(self, size: int) -> None: + self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) + def add_layer_norm_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) diff --git a/src/llama.cpp b/src/llama.cpp index bfc292f59aec7..c755e728fe757 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -333,6 +333,8 @@ enum llm_kv { LLM_KV_SSM_TIME_STEP_RANK, LLM_KV_SSM_DT_B_C_RMS, + LLM_KV_WKV_HEAD_SIZE, + LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_PRE, LLM_KV_TOKENIZER_LIST, @@ -433,6 +435,8 @@ static const std::map LLM_KV_NAMES = { { 
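The new rescale_every_n_layers key records a trick RWKV uses to keep half-precision activations in range: checkpoints are typically run with the output projections of deeper layers pre-scaled down, and inference compensates by halving the residual stream every N layers, which is the scale-by-0.5 added to the graph above. A hedged sketch of the inference-side bookkeeping (note the guard against N == 0, the hparam's default when the key is absent):

    #include <stddef.h>

    /* Halve the residual stream every `rescale_every` layers; with deeper
     * layers' weights scaled down correspondingly at conversion time, the
     * overall function is unchanged while fp16 activations stay bounded. */
    static void maybe_rescale(float *x, size_t n, int layer, int rescale_every) {
        if (rescale_every > 0 && (layer + 1) % rescale_every == 0) {
            for (size_t i = 0; i < n; ++i) {
                x[i] *= 0.5f;
            }
        }
    }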
LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -2291,6 +2295,7 @@ struct llama_hparams { // for RWKV uint32_t rescale_every_n_layers = 0; + uint32_t wkv_head_size = 0; float rope_attn_factor = 1.0f; float rope_freq_base_train; @@ -2355,6 +2360,9 @@ struct llama_hparams { if (this->ssm_dt_rank != other.ssm_dt_rank) return true; if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true; + if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true; + if (this->wkv_head_size != other.wkv_head_size) return true; + if (this->dec_start_token_id != other.dec_start_token_id) return true; const float EPSILON = 1e-9f; @@ -2418,15 +2426,25 @@ struct llama_hparams { } uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings - // corresponds to Mamba's conv_states size - // TODO: maybe support other convolution strides than 1 - // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; + // corresponds to Mamba's conv_states size or RWKV's token_shift states size + if (wkv_head_size != 0) { + // for RWKV models + return 2 * n_embd; + } else { + // TODO: maybe support other convolution strides than 1 + // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed + return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; + } } uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings - // corresponds to Mamba's ssm_states size - return ssm_d_state * ssm_d_inner; + if (wkv_head_size != 0) { + // corresponds to RWKV's wkv_states size + return n_embd * wkv_head_size; + } else { + // corresponds to Mamba's ssm_states size + return ssm_d_state * ssm_d_inner; + } } }; @@ -5888,12 +5906,8 @@ static void llm_load_hparams( case LLM_ARCH_RWKV: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); - - // TODO: Re-using mamba keys right now, but RWKV isn't state-space - ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); - ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); - ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); } break; default: (void)0; } From 98ce5f43f0e0ce10ccf0d6cd85aca8f9757f86a7 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Wed, 7 Aug 2024 16:40:41 +0800 Subject: [PATCH 21/53] Fix offloading layers to CUDA Signed-off-by: Molly Sophia --- ggml/src/ggml.c | 8 +++++--- src/llama.cpp | 22 ++++++++++++++++------ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 78f1aa3b28b95..17e92eff2020b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7675,7 +7675,8 @@ struct ggml_tensor * ggml_rwkv_wkv( is_node = true; } - const int64_t ne[4] = { S * H, n_tokens, 1, 1 }; + // concat output and new_state + const int64_t ne[4] = { S * H, n_tokens + S, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_RWKV_WKV; @@ -16853,11 +16854,12 @@ static void ggml_compute_forward_add_rel_pos( static void ggml_compute_forward_rwkv_wkv_f32( const struct ggml_compute_params * params, struct 
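With the dedicated wkv.head_size key in place, the recurrent-state sizes stop piggybacking on Mamba's SSM parameters: per layer and per sequence, the "k" cache row holds the two token-shift vectors (one for the time-mix branch, one for the channel-mix branch) and the "v" cache row holds the WKV state, one head_size-by-head_size matrix per head. A small sketch of the resulting element counts (using n_embd = n_head * head_size):

    #include <stdint.h>

    /* token-shift carry: two n_embd vectors per layer */
    static uint32_t rwkv_shift_state_size(uint32_t n_embd) {
        return 2 * n_embd;
    }

    /* WKV state: n_head * head_size * head_size = n_embd * head_size floats */
    static uint32_t rwkv_wkv_state_size(uint32_t n_embd, uint32_t head_size) {
        return n_embd * head_size;
    }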
ggml_tensor * dst) { - const size_t T = dst->ne[1]; + const size_t T = dst->src[1]->ne[3]; const size_t C = dst->ne[0]; const size_t H = dst->src[1]->ne[2]; float * dst_data = (float *) dst->data; + float * state = ((float *) dst->data) + C * T; if (params->ith != 0) { return; @@ -16870,7 +16872,7 @@ static void ggml_compute_forward_rwkv_wkv_f32( float * r = (float *) dst->src[2]->data; float * time_faaaa = (float *) dst->src[3]->data; float * time_decay = (float *) dst->src[4]->data; - float * state = (float *) dst->src[5]->data; + memcpy(state, dst->src[5]->data, (C / H) * C * sizeof(float)); size_t t_stride = H * (C / H); diff --git a/src/llama.cpp b/src/llama.cpp index c755e728fe757..5e474d61d3550 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9366,7 +9366,7 @@ static struct ggml_tensor * llm_build_time_mix( const struct llama_layer * layer, struct ggml_tensor * current, struct ggml_tensor * x_prev, - struct ggml_tensor * wkv_state) { + struct ggml_tensor ** wkv_state) { size_t n_embed = current->ne[0]; size_t n_tokens = current->ne[1]; size_t head_size = layer->time_mix_first->ne[0]; @@ -9509,13 +9509,15 @@ static struct ggml_tensor * llm_build_time_mix( w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed) ); - w = ggml_exp(ctx, ggml_neg_inplace(ctx, ggml_exp(ctx, w))); + w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); k = ggml_transpose(ctx, k); v = ggml_transpose(ctx, v); r = ggml_transpose(ctx, r); - current = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, wkv_state); + struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); + current = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); + *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size, n_embed * n_tokens * sizeof(float)); // ggml_group_norm considers groups in the third dimension. 
current = ggml_reshape_4d(ctx, current, 1, 1, n_embed, n_tokens); @@ -15096,7 +15098,7 @@ struct llm_build_context { const llama_layer * layer = &model.layers[layer_i]; // TODO: handle multiple kv cache cells - struct ggml_tensor * wkv_state = ggml_view_1d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), (kv_self.size - 1) * hparams.n_embd_v_s() * ggml_type_size(kv_self.k_l[layer_i]->type)); + struct ggml_tensor * wkv_state = ggml_view_1d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), (kv_self.size - 1) * hparams.n_embd_v_s() * ggml_type_size(kv_self.k_l[layer_i]->type)); struct ggml_tensor * att_shift = ggml_view_1d(ctx0, kv_self.k_l[layer_i], n_embd, (kv_self.size - 1) * 2 * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)); struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, kv_self.k_l[layer_i], n_embd, ((kv_self.size - 1) * 2 + 1) * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)); @@ -15110,7 +15112,7 @@ struct llm_build_context { n_embd * ggml_type_size(x_prev->type) ); - x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, wkv_state)); + x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_state)); ggml_build_forward_expand(gf, x); ggml_build_forward_expand( gf, @@ -15125,6 +15127,14 @@ struct llm_build_context { att_shift ) ); + ggml_build_forward_expand( + gf, + ggml_cpy( + ctx0, + wkv_state, + ggml_view_1d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), (kv_self.size - 1) * hparams.n_embd_v_s() * ggml_type_size(kv_self.k_l[layer_i]->type)) + ) + ); x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); x_prev = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); @@ -15151,7 +15161,7 @@ struct llm_build_context { ) ); if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) { - x = ggml_scale_inplace(ctx0, x, 0.5F); + x = ggml_scale(ctx0, x, 0.5F); } } From 01dcf4bb7706c11bfd4754139693d2df40cfb3d3 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 9 Aug 2024 20:51:00 +0800 Subject: [PATCH 22/53] Fix parallel inferencing for RWKV Signed-off-by: Molly Sophia --- ggml/include/ggml.h | 10 ++- ggml/src/ggml.c | 157 +++++++++++++++++++++++++++++++++++++++++--- src/llama.cpp | 106 +++++++++++++++++++++--------- 3 files changed, 231 insertions(+), 42 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 39aff9e39a68a..76a3176a19608 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -513,6 +513,7 @@ extern "C" { GGML_OP_GET_REL_POS, GGML_OP_ADD_REL_POS, GGML_OP_RWKV_WKV, + GGML_OP_RWKV_TOKEN_SHIFT, GGML_OP_UNARY, @@ -1904,7 +1905,14 @@ extern "C" { struct ggml_tensor * r, struct ggml_tensor * tf, struct ggml_tensor * td, - struct ggml_tensor * state); + struct ggml_tensor * state, + struct ggml_tensor * state_seq); + + GGML_API struct ggml_tensor * ggml_rwkv_token_shift( + struct ggml_context * ctx, + struct ggml_tensor * x_carry, + struct ggml_tensor * x_norm, + struct ggml_tensor * state_seq); // custom operators diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 17e92eff2020b..06d8a8654d4ee 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2836,6 +2836,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GET_REL_POS", "ADD_REL_POS", "RWKV_WKV", + "RWKV_TOKEN_SHIFT", "UNARY", @@ -2854,7 +2855,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); +static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); static const char * 
GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2928,7 +2929,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "win_unpart(x)", "get_rel_pos(x)", "add_rel_pos(x)", - "rwkv_wkv(x, k, v, r, tf, td, s)", + "rwkv_wkv(k, v, r, tf, td, s, sq)", + "rwkv_token_shift(xc, xn, sq)", "unary(x)", @@ -2947,7 +2949,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); +static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -7648,35 +7650,39 @@ struct ggml_tensor * ggml_rwkv_wkv( struct ggml_tensor * r, struct ggml_tensor * tf, struct ggml_tensor * td, - struct ggml_tensor * state) { + struct ggml_tensor * state, + struct ggml_tensor * state_seq) { GGML_ASSERT(ggml_is_contiguous(k)); GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(r)); GGML_ASSERT(ggml_is_contiguous(tf)); GGML_ASSERT(ggml_is_contiguous(td)); GGML_ASSERT(ggml_is_contiguous(state)); + GGML_ASSERT(ggml_is_contiguous(state_seq)); + GGML_ASSERT(state_seq->type == GGML_TYPE_I32); const int64_t S = k->ne[0]; const int64_t H = k->ne[2]; const int64_t n_tokens = k->ne[3]; + const int64_t n_kv = state_seq->ne[0]; { GGML_ASSERT(k->ne[1] == 1); GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens); GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens); // TODO: RWKV v4 and v5 GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens); - GGML_ASSERT(ggml_nelements(state) == S * S * H); + GGML_ASSERT(ggml_nelements(state) == S * S * H * n_kv); } bool is_node = false; - if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad) { + if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad || state_seq->grad) { GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } // concat output and new_state - const int64_t ne[4] = { S * H, n_tokens + S, 1, 1 }; + const int64_t ne[4] = { S * H, n_tokens + S * n_kv, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_RWKV_WKV; @@ -7687,6 +7693,48 @@ struct ggml_tensor * ggml_rwkv_wkv( result->src[3] = tf; result->src[4] = td; result->src[5] = state; + result->src[6] = state_seq; + + return result; +} + +// ggml_rwkv_token_shift + +struct ggml_tensor * ggml_rwkv_token_shift( + struct ggml_context * ctx, + struct ggml_tensor * x_carry, + struct ggml_tensor * x_norm, + struct ggml_tensor * state_seq) { + GGML_ASSERT(ggml_is_contiguous(x_carry)); + GGML_ASSERT(ggml_is_contiguous(x_norm)); + GGML_ASSERT(ggml_is_contiguous(state_seq)); + GGML_ASSERT(state_seq->type == GGML_TYPE_I32); + + const int64_t n_embd = x_norm->ne[0]; + const int64_t n_kv = state_seq->ne[0]; + const int64_t n_tokens = state_seq->ne[1]; + { + GGML_ASSERT(x_norm->ne[0] == n_embd); + GGML_ASSERT(x_norm->ne[1] == n_tokens); + GGML_ASSERT(ggml_nelements(x_carry) == n_embd * n_kv); + } + + bool is_node = false; + + if (x_carry->grad || x_norm->grad || state_seq->grad) { + GGML_ABORT("fatal error"); // TODO: implement backward + is_node = true; + } + + // concat output and new_state + const int64_t ne[4] = { n_embd, n_tokens + n_kv, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_RWKV_TOKEN_SHIFT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = x_carry; + result->src[1] = x_norm; + result->src[2] = state_seq; return result; } @@ -16857,6 +16905,7 @@ static void ggml_compute_forward_rwkv_wkv_f32( const size_t T = dst->src[1]->ne[3]; const size_t C = dst->ne[0]; const size_t H = dst->src[1]->ne[2]; + const size_t n_kv = dst->src[6]->ne[0]; float * dst_data = (float *) dst->data; float * state = ((float *) dst->data) + C * T; @@ -16872,7 +16921,8 @@ static void ggml_compute_forward_rwkv_wkv_f32( float * r = (float *) dst->src[2]->data; float * time_faaaa = (float *) dst->src[3]->data; float * time_decay = (float *) dst->src[4]->data; - memcpy(state, dst->src[5]->data, (C / H) * C * sizeof(float)); + int32_t * seq_data = (int32_t *) dst->src[6]->data; + memcpy(state, dst->src[5]->data, (C / H) * C * n_kv * sizeof(float)); size_t t_stride = H * (C / H); @@ -16885,6 +16935,7 @@ static void ggml_compute_forward_rwkv_wkv_f32( // recursive through each token for (size_t t = 0; t < T; t++) { size_t t_offset = t * t_stride; + float * state_cur = state + (C / H) * C * seq_data[t * n_kv]; for (size_t h = 0; h < H; h++) { size_t h_offset = h * h_stride; @@ -16908,14 +16959,23 @@ static void ggml_compute_forward_rwkv_wkv_f32( float v_val = v[t_h_j_offset]; float kv_val = v_val * k_val; - float prev_state_val = state[h_2d_i_j_offset]; + float prev_state_val = state_cur[h_2d_i_j_offset]; float temp_val = kv_val * time_faaaa_val + prev_state_val; dst_data[t_h_j_offset] += temp_val * r_val; - state[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; + state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; } } } } + + for (size_t t = 0; t < T; t++) { + for (size_t kv = 1; kv < n_kv; kv++) { + int64_t seq = seq_data[t * n_kv + kv]; + if (seq >= 0 && seq_data[(t + 1) * n_kv + kv] != seq) { + memcpy(state + (C / H) * C * seq, state + (C / H) * C * seq_data[t * n_kv], (C / H) * C * sizeof(float)); + } + } + } } static void ggml_compute_forward_rwkv_wkv( @@ -16936,6 +16996,77 @@ static void ggml_compute_forward_rwkv_wkv( } } +static void ggml_compute_forward_rwkv_token_shift_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + const int64_t n_embd = dst->ne[0]; + const int64_t n_kv = dst->src[2]->ne[0]; + const int64_t n_tokens = dst->src[1]->ne[1]; + float * dst_data = (float *) dst->data; + float * x_carry = (float *) dst->src[0]->data; + float * x_norm = (float *) dst->src[1]->data; + int32_t * sq_data = (int32_t *) dst->src[2]->data; + + if (params->ith != 0) { + return; + } + + int32_t seq_start = 0; + int32_t seq_length = 0; + + for (int i1 = 0; i1 < n_kv; ++i1) { + seq_start = -1; + // assume that the tokens for each sequence are contiguous + for (int i2 = 0; i2 < n_tokens; ++i2) { + int32_t seq = sq_data[i2*n_kv]; + if (seq == i1 && seq_start < 0) { + seq_start = i2; + } + + if ((seq_start >= 0 && seq != i1) || i2 == n_tokens - 1) { + seq_length = i2 - seq_start + (i2 == n_tokens - 1); + break; + } + } + + if (seq_start >= 0) { + int32_t seq = sq_data[seq_start*n_kv]; + memcpy(dst_data + seq_start*n_embd, x_carry + seq*n_embd, n_embd*sizeof(float)); + memcpy(dst_data + (seq_start+1)*n_embd, x_norm + seq_start*n_embd, (seq_length-1)*n_embd*sizeof(float)); + } + } + + for (int i3 = 0; i3 < n_kv; ++i3) { + int32_t last_token_pos = 0; + for (int i4 = 0; i4 < n_tokens; ++i4) { + for (int i5 = 0; i5 < n_kv; ++i5) { + if (sq_data[i4*n_kv + i5] == i3) { + last_token_pos = i4; + } + } + } + memcpy(dst_data + (n_tokens + i3)*n_embd, x_norm + 
last_token_pos*n_embd, n_embd*sizeof(float)); + } +} + +static void ggml_compute_forward_rwkv_token_shift( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rwkv_token_shift_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -17591,6 +17722,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rwkv_wkv(params, tensor); } break; + case GGML_OP_RWKV_TOKEN_SHIFT: + { + ggml_compute_forward_rwkv_token_shift(params, tensor); + } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; @@ -18715,6 +18850,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_GET_REL_POS: case GGML_OP_ADD_REL_POS: case GGML_OP_RWKV_WKV: + case GGML_OP_RWKV_TOKEN_SHIFT: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: @@ -19290,6 +19426,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: case GGML_OP_RWKV_WKV: + case GGML_OP_RWKV_TOKEN_SHIFT: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: diff --git a/src/llama.cpp b/src/llama.cpp index 5e474d61d3550..9606ae0b98944 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9366,11 +9366,13 @@ static struct ggml_tensor * llm_build_time_mix( const struct llama_layer * layer, struct ggml_tensor * current, struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state) { + struct ggml_tensor ** wkv_state, + struct ggml_tensor * state_seq) { size_t n_embed = current->ne[0]; size_t n_tokens = current->ne[1]; size_t head_size = layer->time_mix_first->ne[0]; size_t head_count = layer->time_mix_first->ne[1]; + size_t n_kv = state_seq->ne[0]; struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current); struct ggml_tensor * xxx = ggml_add_inplace( @@ -9515,9 +9517,9 @@ static struct ggml_tensor * llm_build_time_mix( k = ggml_transpose(ctx, k); v = ggml_transpose(ctx, v); r = ggml_transpose(ctx, r); - struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); + struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state, state_seq); current = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size, n_embed * n_tokens * sizeof(float)); + *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_kv, n_embed * n_tokens * sizeof(float)); // ggml_group_norm considers groups in the third dimension. 
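Setting aside the per-sequence bookkeeping that state_seq adds, the token-shift operator introduced above does something simple for a single sequence: every token sees the previous token's normalized embedding, the first token sees the carry left over from the previous call, and the last token becomes the new carry. A single-sequence reference sketch (names are mine):

    #include <string.h>
    #include <stddef.h>

    /* x_norm: n_tokens * n_embd input; carry: n_embd, updated in place;
     * x_prev: n_tokens * n_embd output used as the "previous token" stream. */
    static void token_shift(size_t n_embd, size_t n_tokens,
                            const float *x_norm, float *carry, float *x_prev) {
        memcpy(x_prev, carry, n_embd * sizeof(float));
        if (n_tokens > 1) {
            memcpy(x_prev + n_embd, x_norm, (n_tokens - 1) * n_embd * sizeof(float));
        }
        memcpy(carry, x_norm + (n_tokens - 1) * n_embd, n_embd * sizeof(float));
    }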
current = ggml_reshape_4d(ctx, current, 1, 1, n_embed, n_tokens); @@ -15092,58 +15094,81 @@ struct llm_build_context { // Input embeddings, start of the model after tokenizing ({n_embd, n_tokens}) ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + struct ggml_tensor * state_mask = build_inp_s_mask(); + struct ggml_tensor * state_seq = build_inp_s_seq(); + ggml_tensor * x = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); for (int layer_i = 0; layer_i < n_layer; ++layer_i) { const llama_layer * layer = &model.layers[layer_i]; - // TODO: handle multiple kv cache cells - struct ggml_tensor * wkv_state = ggml_view_1d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), (kv_self.size - 1) * hparams.n_embd_v_s() * ggml_type_size(kv_self.k_l[layer_i]->type)); - struct ggml_tensor * att_shift = ggml_view_1d(ctx0, kv_self.k_l[layer_i], n_embd, (kv_self.size - 1) * 2 * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)); - struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, kv_self.k_l[layer_i], n_embd, ((kv_self.size - 1) * 2 + 1) * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)); + struct ggml_tensor * token_shift = ggml_reshape_2d(ctx0, kv_self.k_l[layer_i], hparams.n_embd_k_s(), kv_self.size); + struct ggml_tensor * wkv_states = ggml_reshape_2d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), kv_self.size); + + { + token_shift = ggml_mul(ctx0, + ggml_view_2d(ctx0, token_shift, token_shift->ne[0], n_kv, token_shift->nb[1], kv_head*token_shift->nb[1]), + state_mask); + wkv_states = ggml_mul(ctx0, + ggml_view_2d(ctx0, wkv_states, wkv_states->ne[0], n_kv, wkv_states->nb[1], kv_head*wkv_states->nb[1]), + state_mask); + } + + token_shift = ggml_cont( + ctx0, + ggml_permute( + ctx0, + ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_kv), + 0, 2, 1, 3 + ) + ); + + struct ggml_tensor * att_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, 0); + struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, n_embd * n_kv * ggml_element_size(kv_self.k_l[layer_i])); struct ggml_tensor * x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); - struct ggml_tensor * x_prev = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - x_prev = ggml_set_1d(ctx0, x_prev, att_shift, 0); - x_prev = ggml_set_1d( + struct ggml_tensor * tmp = ggml_rwkv_token_shift(ctx0, att_shift, x_norm, state_seq); + struct ggml_tensor * x_prev = ggml_reshape_2d( ctx0, - x_prev, - ggml_view_1d(ctx0, x_norm, (n_tokens - 1) * n_embd, 0), - n_embd * ggml_type_size(x_prev->type) + ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0), + n_embd, n_tokens ); - x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_state)); + x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq)); ggml_build_forward_expand(gf, x); ggml_build_forward_expand( gf, ggml_cpy( ctx0, + wkv_states, ggml_view_1d( ctx0, - x_norm, - n_embd, - (n_tokens - 1) * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) - ), - att_shift + kv_self.v_l[layer_i], + hparams.n_embd_v_s() * n_kv, + hparams.n_embd_v_s() * kv_head * ggml_type_size(kv_self.v_l[layer_i]->type) + ) ) ); ggml_build_forward_expand( gf, ggml_cpy( ctx0, - wkv_state, - ggml_view_1d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), (kv_self.size - 1) * hparams.n_embd_v_s() * ggml_type_size(kv_self.k_l[layer_i]->type)) + ggml_view_1d( + ctx0, + tmp, + n_embd * n_kv, + n_tokens * 
n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) + ), + ggml_view_1d(ctx0, token_shift, n_embd * n_kv, 0) ) ); x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); - x_prev = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - x_prev = ggml_set_1d(ctx0, x_prev, ffn_shift, 0); - x_prev = ggml_set_1d( + tmp = ggml_rwkv_token_shift(ctx0, ffn_shift, x_norm, state_seq); + x_prev = ggml_reshape_2d( ctx0, - x_prev, - ggml_view_1d(ctx0, x_norm, (n_tokens - 1) * n_embd, 0), - n_embd * ggml_type_size(x_prev->type) + ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0), + n_embd, n_tokens ); x = ggml_add(ctx0, x, llm_build_channel_mix(ctx0, layer, x_norm, x_prev)); ggml_build_forward_expand(gf, x); @@ -15153,13 +15178,32 @@ struct llm_build_context { ctx0, ggml_view_1d( ctx0, - x_norm, - n_embd, - (n_tokens - 1) * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) + tmp, + n_embd * n_kv, + n_tokens * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) ), - ffn_shift + ggml_view_1d(ctx0, token_shift, n_embd * n_kv, n_kv * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)) + ) + ); + + token_shift = ggml_cont( + ctx0, + ggml_permute( + ctx0, + ggml_reshape_3d(ctx0, token_shift, n_embd, n_kv, 2), + 0, 2, 1, 3 + ) + ); + + ggml_build_forward_expand( + gf, + ggml_cpy( + ctx0, + ggml_view_1d(ctx0, token_shift, n_embd * n_kv * 2, 0), + ggml_view_1d(ctx0, kv_self.k_l[layer_i], hparams.n_embd_k_s() * n_kv, hparams.n_embd_k_s() * kv_head * ggml_type_size(kv_self.k_l[layer_i]->type)) ) ); + if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) { x = ggml_scale(ctx0, x, 0.5F); } From 6ae2f4866f4d921f434c7b0f32917a00a11b4f94 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sun, 11 Aug 2024 10:13:33 +0800 Subject: [PATCH 23/53] Remove trailing whitespaces Signed-off-by: Molly Sophia --- src/llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 9606ae0b98944..6463309af0977 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5903,7 +5903,7 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; - case LLM_ARCH_RWKV: + case LLM_ARCH_RWKV: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); @@ -9546,7 +9546,6 @@ static struct ggml_tensor * llm_build_channel_mix( const struct llama_layer * layer, struct ggml_tensor * current, struct ggml_tensor * x_prev) { - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current); struct ggml_tensor * xk = ggml_add_inplace( ctx, From 8bc1f9ae8057a13582135bee32b6df433543b4e1 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sun, 11 Aug 2024 12:06:16 +0800 Subject: [PATCH 24/53] build_rwkv: Avoid using inplace operations Signed-off-by: Molly Sophia --- src/llama.cpp | 146 +++++++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 85 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 6463309af0977..93f003b391b0a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9364,36 +9364,29 @@ static struct ggml_tensor * llm_build_mamba( static struct ggml_tensor * llm_build_time_mix( struct ggml_context * ctx, const struct llama_layer * layer, - struct ggml_tensor * current, + struct ggml_tensor * cur, struct ggml_tensor * x_prev, struct ggml_tensor ** wkv_state, struct ggml_tensor * state_seq) { - size_t n_embed = current->ne[0]; - size_t n_tokens = current->ne[1]; + size_t n_embed = cur->ne[0]; + size_t n_tokens = cur->ne[1]; size_t 
head_size = layer->time_mix_first->ne[0]; size_t head_count = layer->time_mix_first->ne[1]; size_t n_kv = state_seq->ne[0]; - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current); - struct ggml_tensor * xxx = ggml_add_inplace( - ctx, - ggml_mul(ctx, sx, layer->time_mix_lerp_x), - current - ); + struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); + struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); xxx = ggml_reshape_4d( ctx, - ggml_tanh_inplace( + ggml_tanh( ctx, ggml_mul_mat(ctx, layer->time_mix_w1, xxx) ), layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens ); - xxx = ggml_cont( - ctx, - ggml_permute(ctx, xxx, 0, 1, 3, 2) - ); + xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2)); xxx = ggml_mul_mat( ctx, @@ -9415,85 +9408,85 @@ static struct ggml_tensor * llm_build_time_mix( struct ggml_tensor *mk = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); mk = ggml_reshape_2d( ctx, - ggml_set_1d_inplace(ctx, mk, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * sizeof(float)), 0), + ggml_set_1d(ctx, mk, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * sizeof(float)), 0), n_embed, n_tokens ); struct ggml_tensor *mv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); mv = ggml_reshape_2d( ctx, - ggml_set_1d_inplace(ctx, mv, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 2 * sizeof(float)), 0), + ggml_set_1d(ctx, mv, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 2 * sizeof(float)), 0), n_embed, n_tokens ); struct ggml_tensor *mr = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); mr = ggml_reshape_2d( ctx, - ggml_set_1d_inplace(ctx, mr, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 3 * sizeof(float)), 0), + ggml_set_1d(ctx, mr, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 3 * sizeof(float)), 0), n_embed, n_tokens ); struct ggml_tensor *mg = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); mg = ggml_reshape_2d( ctx, - ggml_set_1d_inplace(ctx, mg, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 4 * sizeof(float)), 0), + ggml_set_1d(ctx, mg, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 4 * sizeof(float)), 0), n_embed, n_tokens ); - struct ggml_tensor * xw = ggml_add_inplace( + struct ggml_tensor * xw = ggml_add( ctx, - ggml_mul_inplace( + ggml_mul( ctx, ggml_add(ctx, mw, layer->time_mix_lerp_w), sx ), - current + cur ); - struct ggml_tensor * xk = ggml_add_inplace( + struct ggml_tensor * xk = ggml_add( ctx, - ggml_mul_inplace( + ggml_mul( ctx, ggml_add(ctx, mk, layer->time_mix_lerp_k), sx ), - current + cur ); - struct ggml_tensor * xv = ggml_add_inplace( + struct ggml_tensor * xv = ggml_add( ctx, - ggml_mul_inplace( + ggml_mul( ctx, ggml_add(ctx, mv, layer->time_mix_lerp_v), sx ), - current + cur ); - struct ggml_tensor * xr = ggml_add_inplace( + struct ggml_tensor * xr = ggml_add( ctx, - ggml_mul_inplace( + ggml_mul( ctx, ggml_add(ctx, mr, layer->time_mix_lerp_r), sx ), - current + cur ); - struct ggml_tensor * xg = ggml_add_inplace( + struct ggml_tensor * xg = ggml_add( ctx, - ggml_mul_inplace( + ggml_mul( ctx, ggml_add(ctx, mg, layer->time_mix_lerp_g), sx ), - current + cur ); struct ggml_tensor * r = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens); struct ggml_tensor * k = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens); struct ggml_tensor * v = ggml_reshape_4d(ctx, 
ggml_mul_mat(ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens); - struct ggml_tensor * g = ggml_silu_inplace( + struct ggml_tensor * g = ggml_silu( ctx, ggml_mul_mat(ctx, layer->time_mix_gate, xg) ); @@ -9501,16 +9494,12 @@ static struct ggml_tensor * llm_build_time_mix( struct ggml_tensor * w = ggml_mul_mat( ctx, layer->time_mix_decay_w2, - ggml_tanh_inplace( + ggml_tanh( ctx, ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) ) ); - w = ggml_add_inplace( - ctx, - w, - ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed) - ); + w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)); w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); @@ -9518,48 +9507,39 @@ static struct ggml_tensor * llm_build_time_mix( v = ggml_transpose(ctx, v); r = ggml_transpose(ctx, r); struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state, state_seq); - current = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); + cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_kv, n_embed * n_tokens * sizeof(float)); // ggml_group_norm considers groups in the third dimension. - current = ggml_reshape_4d(ctx, current, 1, 1, n_embed, n_tokens); - current = ggml_group_norm(ctx, current, head_count, 64e-5f); + cur = ggml_reshape_4d(ctx, cur, 1, 1, n_embed, n_tokens); + cur = ggml_group_norm(ctx, cur, head_count, 64e-5f); // Convert back to a regular vector. - current = ggml_reshape_2d(ctx, current, n_embed, n_tokens); - current = ggml_add_inplace( - ctx, - ggml_mul_inplace( - ctx, - current, - layer->time_mix_ln - ), - layer->time_mix_ln_b - ); + cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); - current = ggml_mul(ctx, current, g); + cur = ggml_mul(ctx, cur, g); - return ggml_mul_mat(ctx, layer->time_mix_output, current); + return ggml_mul_mat(ctx, layer->time_mix_output, cur); } static struct ggml_tensor * llm_build_channel_mix( struct ggml_context * ctx, const struct llama_layer * layer, - struct ggml_tensor * current, + struct ggml_tensor * cur, struct ggml_tensor * x_prev) { - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current); - struct ggml_tensor * xk = ggml_add_inplace( - ctx, - ggml_mul(ctx, sx, layer->channel_mix_lerp_k), - current - ); - struct ggml_tensor * xr = ggml_add_inplace( + struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); + struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur); + struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur); + + struct ggml_tensor * r = ggml_sigmoid(ctx, ggml_mul_mat(ctx, layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr( ctx, - ggml_mul(ctx, sx, layer->channel_mix_lerp_r), - current + ggml_relu( + ctx, + ggml_mul_mat(ctx, layer->channel_mix_key, xk) + ) ); - struct ggml_tensor * r = ggml_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr_inplace(ctx, ggml_relu_inplace(ctx, ggml_mul_mat(ctx, layer->channel_mix_key, xk))); - return ggml_mul_inplace(ctx, r, ggml_mul_mat(ctx, layer->channel_mix_value, k)); + return ggml_mul(ctx, r, ggml_mul_mat(ctx, layer->channel_mix_value, k)); } struct llm_build_context { @@ -15090,13 +15070,12 @@ struct llm_build_context { // Token shift state dimensions should be 2 * n_emb 
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - // Input embeddings, start of the model after tokenizing ({n_embd, n_tokens}) ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); struct ggml_tensor * state_mask = build_inp_s_mask(); struct ggml_tensor * state_seq = build_inp_s_seq(); - ggml_tensor * x = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + ggml_tensor * cur = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); for (int layer_i = 0; layer_i < n_layer; ++layer_i) { const llama_layer * layer = &model.layers[layer_i]; @@ -15125,7 +15104,7 @@ struct llm_build_context { struct ggml_tensor * att_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, 0); struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, n_embd * n_kv * ggml_element_size(kv_self.k_l[layer_i])); - struct ggml_tensor * x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); + struct ggml_tensor * x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); struct ggml_tensor * tmp = ggml_rwkv_token_shift(ctx0, att_shift, x_norm, state_seq); struct ggml_tensor * x_prev = ggml_reshape_2d( ctx0, @@ -15133,8 +15112,8 @@ struct llm_build_context { n_embd, n_tokens ); - x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq)); - ggml_build_forward_expand(gf, x); + cur = ggml_add(ctx0, cur, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq)); + ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, ggml_cpy( @@ -15162,15 +15141,15 @@ struct llm_build_context { ) ); - x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); + x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); tmp = ggml_rwkv_token_shift(ctx0, ffn_shift, x_norm, state_seq); x_prev = ggml_reshape_2d( ctx0, ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0), n_embd, n_tokens ); - x = ggml_add(ctx0, x, llm_build_channel_mix(ctx0, layer, x_norm, x_prev)); - ggml_build_forward_expand(gf, x); + cur = ggml_add(ctx0, cur, llm_build_channel_mix(ctx0, layer, x_norm, x_prev)); + ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, ggml_cpy( @@ -15204,21 +15183,18 @@ struct llm_build_context { ); if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) { - x = ggml_scale(ctx0, x, 0.5F); + cur = ggml_scale(ctx0, cur, 0.5F); } } - // Something related to skipping tokens, specifics unclear ggml_tensor * inp_out_ids = build_inp_out_ids(); - x = ggml_get_rows(ctx0, x, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // Output head, convert result vector to logits - x = llm_build_norm(ctx0, x, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - x = ggml_mul_mat(ctx0, model.output, x); + cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); + cur = ggml_mul_mat(ctx0, model.output, cur); - // Mark the output as being the result - cb(x, "result_output", -1); - ggml_build_forward_expand(gf, x); + cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); return gf; } From 18decea3ed978b650bd55fb218728f2f283f5cfd Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sun, 11 Aug 2024 12:19:45 +0800 Subject: [PATCH 25/53] convert_hf_to_gguf: rwkv: 
Avoid using ``eval`` Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a9cfa9ffb0091..9c3f81eea8b4c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3,6 +3,7 @@ from __future__ import annotations +import ast import logging import argparse import contextlib @@ -2730,12 +2731,14 @@ def set_vocab(self): with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: lines = f.readlines() for line in lines: - x = eval(line[line.index(' '):line.rindex(' ')]) - x = x.encode("utf-8") if isinstance(x, str) else x - assert isinstance(x, bytes) - assert len(x) == int(line[line.rindex(' '):]) + parts = line.split(' ') + assert len(parts) >= 3 + _, token, token_len = int(parts[0]), ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len token_text: str = "" - for b in x: + for b in token: token_text += f"\\x{b:02x}" tokens.append(token_text.encode("utf-8")) toktypes.append(gguf.TokenType.NORMAL) From 7f2e370fa25c4514370f07b5429a8524d7ea8e07 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 12 Aug 2024 09:08:30 +0800 Subject: [PATCH 26/53] convert_hf_to_gguf: rwkv tokenizer: Don't escape sequences manually Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9c3f81eea8b4c..48ab70ad6c1a0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2737,9 +2737,7 @@ def set_vocab(self): token = token.encode("utf-8") if isinstance(token, str) else token assert isinstance(token, bytes) assert len(token) == token_len - token_text: str = "" - for b in token: - token_text += f"\\x{b:02x}" + token_text: str = str(token)[2:-1] tokens.append(token_text.encode("utf-8")) toktypes.append(gguf.TokenType.NORMAL) remainder = vocab_size - len(tokens) From c6955525b4c6432a11d48aa03e18a1cc37092111 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 12 Aug 2024 09:12:16 +0800 Subject: [PATCH 27/53] Update convert_hf_to_gguf.py Co-authored-by: compilade --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 48ab70ad6c1a0..5eea3149f4c90 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2733,7 +2733,7 @@ def set_vocab(self): for line in lines: parts = line.split(' ') assert len(parts) >= 3 - _, token, token_len = int(parts[0]), ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) token = token.encode("utf-8") if isinstance(token, str) else token assert isinstance(token, bytes) assert len(token) == token_len From 8aa711ad986adf70e90963463ab8984661167f78 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 12 Aug 2024 09:29:47 +0800 Subject: [PATCH 28/53] ggml: Add backward computation for unary op ``exp`` Signed-off-by: Molly Sophia --- ggml/src/ggml.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 06d8a8654d4ee..a32cfcb097754 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18843,6 +18843,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor zero_table); } } break; + case GGML_UNARY_OP_EXP: + { + if (src0->grad) { 
+ src0->grad = ggml_add_or_set(ctx, + src0->grad, + ggml_mul(ctx, tensor, tensor->grad), + zero_table); + } + } break; default: GGML_ABORT("fatal error"); } From ae9936a80d6934ca9f80793c0803f39d541b15a4 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 12 Aug 2024 14:14:56 +0800 Subject: [PATCH 29/53] Update convert_hf_to_gguf.py Co-authored-by: compilade --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5eea3149f4c90..2f0f4d5fb8e85 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2737,7 +2737,7 @@ def set_vocab(self): token = token.encode("utf-8") if isinstance(token, str) else token assert isinstance(token, bytes) assert len(token) == token_len - token_text: str = str(token)[2:-1] + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" tokens.append(token_text.encode("utf-8")) toktypes.append(gguf.TokenType.NORMAL) remainder = vocab_size - len(tokens) From 5afa3eff3ac35a6d9467e349eee7e5fc287ffd08 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 12 Aug 2024 14:16:02 +0800 Subject: [PATCH 30/53] Update convert_hf_to_gguf.py Co-authored-by: compilade --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2f0f4d5fb8e85..5d48980ebcb52 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2742,8 +2742,8 @@ def set_vocab(self): toktypes.append(gguf.TokenType.NORMAL) remainder = vocab_size - len(tokens) assert remainder >= 0 - for i in range(remainder): - tokens.append(f"".encode("utf-8")) + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) toktypes.append(gguf.TokenType.UNUSED) self.gguf_writer.add_tokenizer_model("rwkv") From 12fbe1ade2190ea2fab5b4cbf96bf0b5b3028d3a Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 12 Aug 2024 14:30:04 +0800 Subject: [PATCH 31/53] Use MODEL_ARCH.RWKV6 instead of MODEL_ARCH.RWKV Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 2 +- gguf-py/gguf/constants.py | 6 +++--- src/llama.cpp | 28 ++++++++++++++-------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5d48980ebcb52..464de8039f0dd 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2719,7 +2719,7 @@ class StarCoder2Model(Model): @Model.register("Rwkv6ForCausalLM") class RwkvModel(Model): - model_arch = gguf.MODEL_ARCH.RWKV + model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 32b9024804082..ebeb200aae961 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -211,7 +211,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() GEMMA2 = auto() STARCODER2 = auto() - RWKV = auto() + RWKV6 = auto() MAMBA = auto() XVERSE = auto() COMMAND_R = auto() @@ -365,7 +365,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.RWKV: "rwkv", + MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", @@ -908,7 +908,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], - MODEL_ARCH.RWKV: [ + MODEL_ARCH.RWKV6: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama.cpp b/src/llama.cpp index 
93f003b391b0a..f28e5f7439981 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -212,7 +212,7 @@ enum llm_arch { LLM_ARCH_JAIS, LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, - LLM_ARCH_RWKV, + LLM_ARCH_RWKV6, LLM_ARCH_UNKNOWN, }; @@ -260,7 +260,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, - { LLM_ARCH_RWKV, "rwkv" }, + { LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1371,7 +1371,7 @@ static const std::map> LLM_TENSOR_NA }, }, { - LLM_ARCH_RWKV, + LLM_ARCH_RWKV6, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, @@ -5903,7 +5903,7 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; - case LLM_ARCH_RWKV: + case LLM_ARCH_RWKV6: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); @@ -8338,7 +8338,7 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; - case LLM_ARCH_RWKV: + case LLM_ARCH_RWKV6: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -9361,7 +9361,7 @@ static struct ggml_tensor * llm_build_mamba( return cur; } -static struct ggml_tensor * llm_build_time_mix( +static struct ggml_tensor * llm_build_time_mix_rwkv6( struct ggml_context * ctx, const struct llama_layer * layer, struct ggml_tensor * cur, @@ -9522,7 +9522,7 @@ static struct ggml_tensor * llm_build_time_mix( return ggml_mul_mat(ctx, layer->time_mix_output, cur); } -static struct ggml_tensor * llm_build_channel_mix( +static struct ggml_tensor * llm_build_channel_mix_rwkv6( struct ggml_context * ctx, const struct llama_layer * layer, struct ggml_tensor * cur, @@ -15064,7 +15064,7 @@ struct llm_build_context { return gf; } - ggml_cgraph * build_rwkv() { + ggml_cgraph * build_rwkv6() { ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // Token shift state dimensions should be 2 * n_emb @@ -15112,7 +15112,7 @@ struct llm_build_context { n_embd, n_tokens ); - cur = ggml_add(ctx0, cur, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq)); + cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq)); ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, @@ -15148,7 +15148,7 @@ struct llm_build_context { ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0), n_embd, n_tokens ); - cur = ggml_add(ctx0, cur, llm_build_channel_mix(ctx0, layer, x_norm, x_prev)); + cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm, x_prev)); ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, @@ -15444,9 +15444,9 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - case LLM_ARCH_RWKV: + case LLM_ARCH_RWKV6: { - result = llm.build_rwkv(); + result = llm.build_rwkv6(); } break; default: GGML_ABORT("fatal error"); @@ -18477,7 +18477,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5: case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: - case LLM_ARCH_RWKV: + case LLM_ARCH_RWKV6: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -18646,7 +18646,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) { bool llama_model_is_recurrent(const struct llama_model * 
model) { switch (model->arch) { case LLM_ARCH_MAMBA: return true; - case LLM_ARCH_RWKV: return true; + case LLM_ARCH_RWKV6: return true; default: return false; } } From 276d53b18f54cec929a8ef63a807f3728477fdc1 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 12 Aug 2024 14:47:26 +0800 Subject: [PATCH 32/53] build_rwkv6: Simplify graph Signed-off-by: Molly Sophia --- src/llama.cpp | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f28e5f7439981..b085daeb91c2b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9398,40 +9398,18 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6( xxx ); - struct ggml_tensor *mw = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); - mw = ggml_reshape_2d( - ctx, - ggml_set_1d(ctx, mw, ggml_view_1d(ctx, xxx, n_embed * n_tokens, 0), 0), - n_embed, n_tokens - ); - - struct ggml_tensor *mk = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); - mk = ggml_reshape_2d( - ctx, - ggml_set_1d(ctx, mk, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * sizeof(float)), 0), - n_embed, n_tokens - ); - - struct ggml_tensor *mv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); - mv = ggml_reshape_2d( - ctx, - ggml_set_1d(ctx, mv, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 2 * sizeof(float)), 0), - n_embed, n_tokens - ); - - struct ggml_tensor *mr = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); - mr = ggml_reshape_2d( - ctx, - ggml_set_1d(ctx, mr, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 3 * sizeof(float)), 0), - n_embed, n_tokens - ); - - struct ggml_tensor *mg = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); - mg = ggml_reshape_2d( - ctx, - ggml_set_1d(ctx, mg, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 4 * sizeof(float)), 0), - n_embed, n_tokens - ); + // struct ggml_tensor *mw = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); + // mw = ggml_reshape_2d( + // ctx, + // ggml_set_1d(ctx, mw, ggml_view_1d(ctx, xxx, n_embed * n_tokens, 0), 0), + // n_embed, n_tokens + // ); + + struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0); + struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float)); + struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float)); + struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float)); + struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float)); struct ggml_tensor * xw = ggml_add( ctx, From b0f4fe5279b65fa87d199b456031360d7b088a4d Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 13 Aug 2024 17:01:44 +0800 Subject: [PATCH 33/53] llama: rwkv6: Detect model.type Signed-off-by: Molly Sophia --- src/llama.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index b085daeb91c2b..e0d395c618e89 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2216,6 +2216,7 @@ enum e_model { MODEL_1B, MODEL_1_3B, MODEL_1_4B, + MODEL_1_6B, MODEL_2B, MODEL_2_8B, MODEL_3B, @@ -5908,6 +5909,18 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, 
hparams.rescale_every_n_layers, false); + + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1_6B; break; + case 32: + switch (hparams.n_embd) { + case 2560: model.type = e_model::MODEL_3B; break; + case 4096: model.type = e_model::MODEL_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 61: model.type = e_model::MODEL_14B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } } break; default: (void)0; } From 683d70cb686668f53684d935c0d999f97132918c Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 13 Aug 2024 17:06:07 +0800 Subject: [PATCH 34/53] llama: rwkv6: Fix tensor loading for 7B/14B models Signed-off-by: Molly Sophia --- src/llama.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e0d395c618e89..1fd91fcd7f921 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8364,10 +8364,9 @@ static bool llm_load_tensors( model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); - // TODO: Parameterize this - const int time_mix_extra_dim = 32; - const int time_decay_extra_dim = 64; - const int head_size = 64; + const int time_mix_extra_dim = (n_embd == 4096) ? 64 : 32; + const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64; + const int head_size = hparams.wkv_head_size; const int attn_hidden_size = n_embd; const int ffn_size = (int)(n_embd * 3.5 / 32) * 32; From ee1b78c0911476d80517d76e602820795649d746 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 13 Aug 2024 17:41:34 +0800 Subject: [PATCH 35/53] llama: rwkv6: Fix group_norm assertion failure with Metal Signed-off-by: Molly Sophia --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 1fd91fcd7f921..67889111a4070 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9501,7 +9501,7 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6( *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_kv, n_embed * n_tokens * sizeof(float)); // ggml_group_norm considers groups in the third dimension. - cur = ggml_reshape_4d(ctx, cur, 1, 1, n_embed, n_tokens); + cur = ggml_reshape_4d(ctx, cur, n_embed / head_count, 1, head_count, n_tokens); cur = ggml_group_norm(ctx, cur, head_count, 64e-5f); // Convert back to a regular vector. 
cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); From c165e346297f955b13649a83cef84b507d3785c2 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 13 Aug 2024 17:46:29 +0800 Subject: [PATCH 36/53] llama: rwkv6: Clean up Signed-off-by: Molly Sophia --- src/llama.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 67889111a4070..a65678fc25293 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9410,13 +9410,6 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6( xxx ); - // struct ggml_tensor *mw = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens); - // mw = ggml_reshape_2d( - // ctx, - // ggml_set_1d(ctx, mw, ggml_view_1d(ctx, xxx, n_embed * n_tokens, 0), 0), - // n_embed, n_tokens - // ); - struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0); struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float)); struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float)); From 6da6aa48b0dbcb3ec392d94538711aa2ba12aa4e Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 13 Aug 2024 18:31:25 +0800 Subject: [PATCH 37/53] llama: rwkv6: Add quantization tensor exclusion Signed-off-by: Molly Sophia --- src/llama.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index a65678fc25293..a6f6ef124da44 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17469,6 +17469,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + // do not quantize RWKV's time_mix_first tensors + quantize &= name.find("time_mix_first.weight") == std::string::npos; + // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; From f5d955d2fe0994e57e8975fcc76b6d8350e8400d Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Fri, 23 Aug 2024 10:14:35 +0800 Subject: [PATCH 38/53] llama: rwkv6: Use the new advanced batch splits Signed-off-by: Molly Sophia --- ggml/include/ggml.h | 10 +-- ggml/src/ggml.c | 156 ++++---------------------------------------- src/llama.cpp | 104 +++++++++++++++-------------- 3 files changed, 66 insertions(+), 204 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 76a3176a19608..39aff9e39a68a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -513,7 +513,6 @@ extern "C" { GGML_OP_GET_REL_POS, GGML_OP_ADD_REL_POS, GGML_OP_RWKV_WKV, - GGML_OP_RWKV_TOKEN_SHIFT, GGML_OP_UNARY, @@ -1905,14 +1904,7 @@ extern "C" { struct ggml_tensor * r, struct ggml_tensor * tf, struct ggml_tensor * td, - struct ggml_tensor * state, - struct ggml_tensor * state_seq); - - GGML_API struct ggml_tensor * ggml_rwkv_token_shift( - struct ggml_context * ctx, - struct ggml_tensor * x_carry, - struct ggml_tensor * x_norm, - struct ggml_tensor * state_seq); + struct ggml_tensor * state); // custom operators diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a32cfcb097754..93f3933e75127 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2836,7 +2836,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GET_REL_POS", "ADD_REL_POS", "RWKV_WKV", - "RWKV_TOKEN_SHIFT", "UNARY", @@ -2855,7 +2854,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); 
+static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2929,8 +2928,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "win_unpart(x)", "get_rel_pos(x)", "add_rel_pos(x)", - "rwkv_wkv(k, v, r, tf, td, s, sq)", - "rwkv_token_shift(xc, xn, sq)", + "rwkv_wkv(k, v, r, tf, td, s)", "unary(x)", @@ -2949,7 +2947,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); +static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -7650,39 +7648,36 @@ struct ggml_tensor * ggml_rwkv_wkv( struct ggml_tensor * r, struct ggml_tensor * tf, struct ggml_tensor * td, - struct ggml_tensor * state, - struct ggml_tensor * state_seq) { + struct ggml_tensor * state) { GGML_ASSERT(ggml_is_contiguous(k)); GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(r)); GGML_ASSERT(ggml_is_contiguous(tf)); GGML_ASSERT(ggml_is_contiguous(td)); GGML_ASSERT(ggml_is_contiguous(state)); - GGML_ASSERT(ggml_is_contiguous(state_seq)); - GGML_ASSERT(state_seq->type == GGML_TYPE_I32); const int64_t S = k->ne[0]; const int64_t H = k->ne[2]; const int64_t n_tokens = k->ne[3]; - const int64_t n_kv = state_seq->ne[0]; + const int64_t n_seqs = state->ne[1]; { GGML_ASSERT(k->ne[1] == 1); GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens); GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens); // TODO: RWKV v4 and v5 GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens); - GGML_ASSERT(ggml_nelements(state) == S * S * H * n_kv); + GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); } bool is_node = false; - if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad || state_seq->grad) { + if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad) { GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } // concat output and new_state - const int64_t ne[4] = { S * H, n_tokens + S * n_kv, 1, 1 }; + const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_RWKV_WKV; @@ -7693,48 +7688,6 @@ struct ggml_tensor * ggml_rwkv_wkv( result->src[3] = tf; result->src[4] = td; result->src[5] = state; - result->src[6] = state_seq; - - return result; -} - -// ggml_rwkv_token_shift - -struct ggml_tensor * ggml_rwkv_token_shift( - struct ggml_context * ctx, - struct ggml_tensor * x_carry, - struct ggml_tensor * x_norm, - struct ggml_tensor * state_seq) { - GGML_ASSERT(ggml_is_contiguous(x_carry)); - GGML_ASSERT(ggml_is_contiguous(x_norm)); - GGML_ASSERT(ggml_is_contiguous(state_seq)); - GGML_ASSERT(state_seq->type == GGML_TYPE_I32); - - const int64_t n_embd = x_norm->ne[0]; - const int64_t n_kv = state_seq->ne[0]; - const int64_t n_tokens = state_seq->ne[1]; - { - GGML_ASSERT(x_norm->ne[0] == n_embd); - GGML_ASSERT(x_norm->ne[1] == n_tokens); - GGML_ASSERT(ggml_nelements(x_carry) == n_embd * n_kv); - } - - bool is_node = false; - - if (x_carry->grad || x_norm->grad || state_seq->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - - // concat output and new_state - const int64_t ne[4] = { n_embd, n_tokens + n_kv, 1, 1 }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - result->op = 
GGML_OP_RWKV_TOKEN_SHIFT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = x_carry; - result->src[1] = x_norm; - result->src[2] = state_seq; return result; } @@ -16905,7 +16858,7 @@ static void ggml_compute_forward_rwkv_wkv_f32( const size_t T = dst->src[1]->ne[3]; const size_t C = dst->ne[0]; const size_t H = dst->src[1]->ne[2]; - const size_t n_kv = dst->src[6]->ne[0]; + const size_t n_seqs = dst->src[5]->ne[1]; float * dst_data = (float *) dst->data; float * state = ((float *) dst->data) + C * T; @@ -16921,8 +16874,7 @@ static void ggml_compute_forward_rwkv_wkv_f32( float * r = (float *) dst->src[2]->data; float * time_faaaa = (float *) dst->src[3]->data; float * time_decay = (float *) dst->src[4]->data; - int32_t * seq_data = (int32_t *) dst->src[6]->data; - memcpy(state, dst->src[5]->data, (C / H) * C * n_kv * sizeof(float)); + memcpy(state, dst->src[5]->data, (C / H) * C * n_seqs * sizeof(float)); size_t t_stride = H * (C / H); @@ -16935,7 +16887,7 @@ static void ggml_compute_forward_rwkv_wkv_f32( // recursive through each token for (size_t t = 0; t < T; t++) { size_t t_offset = t * t_stride; - float * state_cur = state + (C / H) * C * seq_data[t * n_kv]; + float * state_cur = state + (C / H) * C * (t / (T / n_seqs)); for (size_t h = 0; h < H; h++) { size_t h_offset = h * h_stride; @@ -16967,15 +16919,6 @@ static void ggml_compute_forward_rwkv_wkv_f32( } } } - - for (size_t t = 0; t < T; t++) { - for (size_t kv = 1; kv < n_kv; kv++) { - int64_t seq = seq_data[t * n_kv + kv]; - if (seq >= 0 && seq_data[(t + 1) * n_kv + kv] != seq) { - memcpy(state + (C / H) * C * seq, state + (C / H) * C * seq_data[t * n_kv], (C / H) * C * sizeof(float)); - } - } - } } static void ggml_compute_forward_rwkv_wkv( @@ -16996,77 +16939,6 @@ static void ggml_compute_forward_rwkv_wkv( } } -static void ggml_compute_forward_rwkv_token_shift_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - const int64_t n_embd = dst->ne[0]; - const int64_t n_kv = dst->src[2]->ne[0]; - const int64_t n_tokens = dst->src[1]->ne[1]; - float * dst_data = (float *) dst->data; - float * x_carry = (float *) dst->src[0]->data; - float * x_norm = (float *) dst->src[1]->data; - int32_t * sq_data = (int32_t *) dst->src[2]->data; - - if (params->ith != 0) { - return; - } - - int32_t seq_start = 0; - int32_t seq_length = 0; - - for (int i1 = 0; i1 < n_kv; ++i1) { - seq_start = -1; - // assume that the tokens for each sequence are contiguous - for (int i2 = 0; i2 < n_tokens; ++i2) { - int32_t seq = sq_data[i2*n_kv]; - if (seq == i1 && seq_start < 0) { - seq_start = i2; - } - - if ((seq_start >= 0 && seq != i1) || i2 == n_tokens - 1) { - seq_length = i2 - seq_start + (i2 == n_tokens - 1); - break; - } - } - - if (seq_start >= 0) { - int32_t seq = sq_data[seq_start*n_kv]; - memcpy(dst_data + seq_start*n_embd, x_carry + seq*n_embd, n_embd*sizeof(float)); - memcpy(dst_data + (seq_start+1)*n_embd, x_norm + seq_start*n_embd, (seq_length-1)*n_embd*sizeof(float)); - } - } - - for (int i3 = 0; i3 < n_kv; ++i3) { - int32_t last_token_pos = 0; - for (int i4 = 0; i4 < n_tokens; ++i4) { - for (int i5 = 0; i5 < n_kv; ++i5) { - if (sq_data[i4*n_kv + i5] == i3) { - last_token_pos = i4; - } - } - } - memcpy(dst_data + (n_tokens + i3)*n_embd, x_norm + last_token_pos*n_embd, n_embd*sizeof(float)); - } -} - -static void ggml_compute_forward_rwkv_token_shift( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch 
(src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_rwkv_token_shift_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -17722,10 +17594,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rwkv_wkv(params, tensor); } break; - case GGML_OP_RWKV_TOKEN_SHIFT: - { - ggml_compute_forward_rwkv_token_shift(params, tensor); - } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; @@ -18859,7 +18727,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_GET_REL_POS: case GGML_OP_ADD_REL_POS: case GGML_OP_RWKV_WKV: - case GGML_OP_RWKV_TOKEN_SHIFT: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: @@ -19435,7 +19302,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: case GGML_OP_RWKV_WKV: - case GGML_OP_RWKV_TOKEN_SHIFT: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: diff --git a/src/llama.cpp b/src/llama.cpp index a6f6ef124da44..f8ec0e323e1c4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9378,15 +9378,20 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6( const struct llama_layer * layer, struct ggml_tensor * cur, struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state, - struct ggml_tensor * state_seq) { + struct ggml_tensor ** wkv_state) { size_t n_embed = cur->ne[0]; - size_t n_tokens = cur->ne[1]; + size_t n_seq_tokens = cur->ne[1]; + size_t n_seqs = cur->ne[2]; size_t head_size = layer->time_mix_first->ne[0]; size_t head_count = layer->time_mix_first->ne[1]; - size_t n_kv = state_seq->ne[0]; + + size_t n_tokens = n_seqs * n_seq_tokens; struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); + + sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); xxx = ggml_reshape_4d( @@ -9489,9 +9494,9 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6( k = ggml_transpose(ctx, k); v = ggml_transpose(ctx, v); r = ggml_transpose(ctx, r); - struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state, state_seq); + struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_kv, n_embed * n_tokens * sizeof(float)); + *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float)); // ggml_group_norm considers groups in the third dimension. 
cur = ggml_reshape_4d(ctx, cur, n_embed / head_count, 1, head_count, n_tokens); @@ -9501,8 +9506,9 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6( cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); cur = ggml_mul(ctx, cur, g); + cur = ggml_mul_mat(ctx, layer->time_mix_output, cur); - return ggml_mul_mat(ctx, layer->time_mix_output, cur); + return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs); } static struct ggml_tensor * llm_build_channel_mix_rwkv6( @@ -15053,49 +15059,56 @@ struct llm_build_context { // Token shift state dimensions should be 2 * n_emb GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + const int64_t n_seqs = batch.n_seqs; + const int64_t n_seq_tokens = batch.n_seq_tokens; + const int64_t n_tokens = batch.n_tokens; + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(batch.equal_seqs); + GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + struct ggml_tensor * state_copy = build_inp_s_copy(); struct ggml_tensor * state_mask = build_inp_s_mask(); - struct ggml_tensor * state_seq = build_inp_s_seq(); ggml_tensor * cur = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); for (int layer_i = 0; layer_i < n_layer; ++layer_i) { const llama_layer * layer = &model.layers[layer_i]; - struct ggml_tensor * token_shift = ggml_reshape_2d(ctx0, kv_self.k_l[layer_i], hparams.n_embd_k_s(), kv_self.size); - struct ggml_tensor * wkv_states = ggml_reshape_2d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), kv_self.size); + // (ab)using the KV cache to store the states + struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, + gf, kv_self.k_l[layer_i], state_copy, state_mask, + hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); + struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, + gf, kv_self.v_l[layer_i], state_copy, state_mask, + hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); - { - token_shift = ggml_mul(ctx0, - ggml_view_2d(ctx0, token_shift, token_shift->ne[0], n_kv, token_shift->nb[1], kv_head*token_shift->nb[1]), - state_mask); - wkv_states = ggml_mul(ctx0, - ggml_view_2d(ctx0, wkv_states, wkv_states->ne[0], n_kv, wkv_states->nb[1], kv_head*wkv_states->nb[1]), - state_mask); - } + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); token_shift = ggml_cont( ctx0, ggml_permute( ctx0, - ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_kv), + ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs), 0, 2, 1, 3 ) ); - struct ggml_tensor * att_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, 0); - struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, n_embd * n_kv * ggml_element_size(kv_self.k_l[layer_i])); + struct ggml_tensor * att_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, 0); + struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, n_embd * n_seqs * ggml_element_size(token_shift)); + att_shift = ggml_reshape_3d(ctx0, att_shift, n_embd, 1, n_seqs); + ffn_shift = ggml_reshape_3d(ctx0, ffn_shift, n_embd, 1, n_seqs); struct ggml_tensor * x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); - struct ggml_tensor * tmp = ggml_rwkv_token_shift(ctx0, att_shift, x_norm, state_seq); - struct ggml_tensor * x_prev = ggml_reshape_2d( + struct ggml_tensor * x_prev = ggml_concat( ctx0, - ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0), - 
n_embd, n_tokens + att_shift, + ggml_view_3d(ctx0, x_norm, n_embd, n_seq_tokens - 1, n_seqs, x_norm->nb[1], x_norm->nb[2], 0), + 1 ); - cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq)); + cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm, x_prev, &wkv_states)); ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, @@ -15105,45 +15118,35 @@ struct llm_build_context { ggml_view_1d( ctx0, kv_self.v_l[layer_i], - hparams.n_embd_v_s() * n_kv, + hparams.n_embd_v_s() * n_seqs, hparams.n_embd_v_s() * kv_head * ggml_type_size(kv_self.v_l[layer_i]->type) ) ) ); + struct ggml_tensor * last_norm = ggml_view_3d(ctx0, x_norm, n_embd, 1, n_seqs, x_norm->nb[1], x_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm)); ggml_build_forward_expand( gf, ggml_cpy( - ctx0, - ggml_view_1d( - ctx0, - tmp, - n_embd * n_kv, - n_tokens * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) - ), - ggml_view_1d(ctx0, token_shift, n_embd * n_kv, 0) + ctx0, last_norm, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, 0) ) ); x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); - tmp = ggml_rwkv_token_shift(ctx0, ffn_shift, x_norm, state_seq); - x_prev = ggml_reshape_2d( + x_prev = ggml_concat( ctx0, - ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0), - n_embd, n_tokens + ffn_shift, + ggml_view_3d(ctx0, x_norm, n_embd, n_seq_tokens - 1, n_seqs, x_norm->nb[1], x_norm->nb[2], 0), + 1 ); cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm, x_prev)); + last_norm = ggml_view_3d(ctx0, x_norm, n_embd, 1, n_seqs, x_norm->nb[1], x_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm)); ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, ggml_cpy( - ctx0, - ggml_view_1d( - ctx0, - tmp, - n_embd * n_kv, - n_tokens * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type) - ), - ggml_view_1d(ctx0, token_shift, n_embd * n_kv, n_kv * n_embd * ggml_type_size(kv_self.k_l[layer_i]->type)) + ctx0, last_norm, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, n_embd * n_seqs * ggml_element_size(token_shift)) ) ); @@ -15151,7 +15154,7 @@ struct llm_build_context { ctx0, ggml_permute( ctx0, - ggml_reshape_3d(ctx0, token_shift, n_embd, n_kv, 2), + ggml_reshape_3d(ctx0, token_shift, n_embd, n_seqs, 2), 0, 2, 1, 3 ) ); @@ -15160,8 +15163,8 @@ struct llm_build_context { gf, ggml_cpy( ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_kv * 2, 0), - ggml_view_1d(ctx0, kv_self.k_l[layer_i], hparams.n_embd_k_s() * n_kv, hparams.n_embd_k_s() * kv_head * ggml_type_size(kv_self.k_l[layer_i]->type)) + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), + ggml_view_1d(ctx0, kv_self.k_l[layer_i], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_type_size(kv_self.k_l[layer_i]->type)) ) ); @@ -15171,6 +15174,7 @@ struct llm_build_context { } ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); From 57decb4a38c3327f8cba0251a822d7af6e82be97 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sun, 25 Aug 2024 12:10:02 +0800 Subject: [PATCH 39/53] Update src/llama.cpp Co-authored-by: compilade --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index f8ec0e323e1c4..8b2b920d23f14 100644 --- 
a/src/llama.cpp +++ b/src/llama.cpp @@ -18636,7 +18636,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) { bool llama_model_is_recurrent(const struct llama_model * model) { switch (model->arch) { case LLM_ARCH_MAMBA: return true; - case LLM_ARCH_RWKV6: return true; + case LLM_ARCH_RWKV6: return true; default: return false; } } From e94778ade0c7224a15989563adf2ed4a7a046a8c Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sun, 25 Aug 2024 12:36:29 +0800 Subject: [PATCH 40/53] llama: rwkv6: Use ``ggml_norm`` instead of ``ggml_group_norm`` Co-authored-by: compilade --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 8b2b920d23f14..68d53672dcae9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9498,10 +9498,10 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6( cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float)); - // ggml_group_norm considers groups in the third dimension. - cur = ggml_reshape_4d(ctx, cur, n_embed / head_count, 1, head_count, n_tokens); - cur = ggml_group_norm(ctx, cur, head_count, 64e-5f); - // Convert back to a regular vector. + // group norm with head_count groups + cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens); + cur = ggml_norm(ctx, cur, 64e-5f); + // Convert back to regular vectors. cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); From 7756afd8ddd5729fd3d9630cd03695f75f180e78 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sun, 25 Aug 2024 15:48:35 +0800 Subject: [PATCH 41/53] llama: rwkv6: Apply code style and misc changes Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 2 ++ src/llama.cpp | 82 ++++++++++++++++++------------------------- 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 464de8039f0dd..44213f2f33d5a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -299,6 +299,7 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.POS_EMBD, gguf.MODEL_TENSOR.TOKEN_TYPES, gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.TIME_MIX_FIRST, ) ) or not name.endswith(".weight") @@ -2764,6 +2765,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(layer_norm_eps) self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_file_type(self.ftype) # required by llama.cpp, unused self.gguf_writer.add_head_count(0) diff --git a/src/llama.cpp b/src/llama.cpp index 68d53672dcae9..5d5a48dbd0a90 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5161,6 +5161,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_1B: return "1B"; case MODEL_1_3B: return "1.3B"; case MODEL_1_4B: return "1.4B"; + case MODEL_1_6B: return "1.6B"; case MODEL_2B: return "2B"; case MODEL_2_8B: return "2.8B"; case MODEL_3B: return "3B"; @@ -15066,49 +15067,40 @@ struct llm_build_context { GGML_ASSERT(batch.equal_seqs); GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - + struct ggml_tensor * cur; + struct ggml_tensor * inpL; struct ggml_tensor * state_copy = build_inp_s_copy(); struct ggml_tensor * state_mask = build_inp_s_mask(); - ggml_tensor * cur = llm_build_norm(ctx0, 
input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); - for (int layer_i = 0; layer_i < n_layer; ++layer_i) { - const llama_layer * layer = &model.layers[layer_i]; + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; // (ab)using the KV cache to store the states struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[layer_i], state_copy, state_mask, + gf, kv_self.k_l[il], state_copy, state_mask, hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[layer_i], state_copy, state_mask, + gf, kv_self.v_l[il], state_copy, state_mask, hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); - cur = ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); + cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); - token_shift = ggml_cont( - ctx0, - ggml_permute( - ctx0, - ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs), - 0, 2, 1, 3 - ) - ); + struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - struct ggml_tensor * att_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, 0); - struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, n_embd * n_seqs * ggml_element_size(token_shift)); - att_shift = ggml_reshape_3d(ctx0, att_shift, n_embd, 1, n_seqs); - ffn_shift = ggml_reshape_3d(ctx0, ffn_shift, n_embd, 1, n_seqs); - - struct ggml_tensor * x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i); + struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il); struct ggml_tensor * x_prev = ggml_concat( ctx0, att_shift, - ggml_view_3d(ctx0, x_norm, n_embd, n_seq_tokens - 1, n_seqs, x_norm->nb[1], x_norm->nb[2], 0), + ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), 1 ); - cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm, x_prev, &wkv_states)); + cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm_att, x_prev, &wkv_states)); ggml_build_forward_expand(gf, cur); ggml_build_forward_expand( gf, @@ -15117,38 +15109,22 @@ struct llm_build_context { wkv_states, ggml_view_1d( ctx0, - kv_self.v_l[layer_i], + kv_self.v_l[il], hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_type_size(kv_self.v_l[layer_i]->type) + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) ) ) ); - struct ggml_tensor * last_norm = ggml_view_3d(ctx0, x_norm, n_embd, 1, n_seqs, x_norm->nb[1], x_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm)); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, last_norm, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, 0) - ) - ); - x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i); + ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, 
            x_prev = ggml_concat(
                ctx0,
                ffn_shift,
-               ggml_view_3d(ctx0, x_norm, n_embd, n_seq_tokens - 1, n_seqs, x_norm->nb[1], x_norm->nb[2], 0),
+               ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
                1
            );
-           cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm, x_prev));
-           last_norm = ggml_view_3d(ctx0, x_norm, n_embd, 1, n_seqs, x_norm->nb[1], x_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm));
+           cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm_ffn, x_prev));
            ggml_build_forward_expand(gf, cur);
-           ggml_build_forward_expand(
-               gf,
-               ggml_cpy(
-                   ctx0, last_norm,
-                   ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, n_embd * n_seqs * ggml_element_size(token_shift))
-               )
-           );

            token_shift = ggml_cont(
                ctx0,
@@ -15159,20 +15135,32 @@ struct llm_build_context {
                )
            );

+           struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
+           struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn));
+
+           token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
+
            ggml_build_forward_expand(
                gf,
                ggml_cpy(
                    ctx0,
                    ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
-                   ggml_view_1d(ctx0, kv_self.k_l[layer_i], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_type_size(kv_self.k_l[layer_i]->type))
+                   ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
                )
            );

-           if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) {
+           if ((il + 1) % hparams.rescale_every_n_layers == 0) {
                cur = ggml_scale(ctx0, cur, 0.5F);
            }
+
+           cur = lctx.cvec.apply_to(ctx0, cur, il);
+           cb(cur, "l_out", il);
+
+           // input for next layer
+           inpL = cur;
        }

+       cur = inpL;
        ggml_tensor * inp_out_ids = build_inp_out_ids();
        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
        cur = ggml_get_rows(ctx0, cur, inp_out_ids);

From 87a29014a44e9d290f48032a50b84cdc25a925a4 Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Sun, 25 Aug 2024 15:56:43 +0800
Subject: [PATCH 42/53] converter: Use class name ``Rwkv6Model``

Signed-off-by: Molly Sophia
---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 44213f2f33d5a..b1e7992f8047c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2719,7 +2719,7 @@ class StarCoder2Model(Model):

 @Model.register("Rwkv6ForCausalLM")
-class RwkvModel(Model):
+class Rwkv6Model(Model):
     model_arch = gguf.MODEL_ARCH.RWKV6

     def set_vocab(self):

From c414a24a5a02ca8c573e9f9cea569bf33fe4d8cb Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Sun, 25 Aug 2024 16:16:29 +0800
Subject: [PATCH 43/53] llama: rwkv6: Make use of key ``feed_forward_length``

Signed-off-by: Molly Sophia
---
 convert_hf_to_gguf.py | 3 ++-
 src/llama.cpp         | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b1e7992f8047c..941697d6d6894 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2757,6 +2757,7 @@ def set_gguf_parameters(self):
        hidden_size = self.hparams["hidden_size"]
        layer_norm_eps = self.hparams["layer_norm_epsilon"]
        rescale_every_n_layers = self.hparams["rescale_every"]
+       intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)

        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
@@ -2765,11 +2766,11 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
        self.gguf_writer.add_wkv_head_size(head_size)
+       self.gguf_writer.add_feed_forward_length(intermediate_size)
        self.gguf_writer.add_file_type(self.ftype)

        # required by llama.cpp, unused
        self.gguf_writer.add_head_count(0)
-       self.gguf_writer.add_feed_forward_length(0)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)

diff --git a/src/llama.cpp b/src/llama.cpp
index 5d5a48dbd0a90..a0b9dbc87c742 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8369,7 +8369,7 @@ static bool llm_load_tensors(
                    const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64;
                    const int head_size = hparams.wkv_head_size;
                    const int attn_hidden_size = n_embd;
-                   const int ffn_size = (int)(n_embd * 3.5 / 32) * 32;
+                   const int ffn_size = hparams.n_ff_arr[0];

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
@@ -8392,7 +8392,6 @@ static bool llm_load_tensors(
                        layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
                        layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});

-                       // TODO: Parametrize hardcoded dimensions for first & decay
                        layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
                        layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
                        layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
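Aside (editorial, not part of the patch series): with this change llama.cpp reads the feed-forward size from the GGUF key instead of recomputing it, and the converter falls back to the usual RWKV sizing rule, roughly 3.5 times the hidden size rounded down to a multiple of 32, when the HF config has no intermediate_size. A tiny Python sketch of that fallback:

    def default_rwkv_ffn_size(hidden_size: int) -> int:
        # ~3.5x the hidden size, rounded down to a multiple of 32,
        # mirroring the converter's fallback when intermediate_size is null
        return int((hidden_size * 3.5) // 32 * 32)

    print(default_rwkv_ffn_size(2048))  # 7168
    print(default_rwkv_ffn_size(1000))  # 3488 (rounding only matters for unusual widths)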
From 6d69fd77b1f6d61b12eb9690b9842f245de63151 Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Sun, 25 Aug 2024 16:26:57 +0800
Subject: [PATCH 44/53] llama: rwkv6: Add kv ``time_mix_extra_dim`` and ``time_decay_extra_dim``

Signed-off-by: Molly Sophia
---
 convert_hf_to_gguf.py       |  4 ++++
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  6 ++++++
 src/llama.cpp               | 14 ++++++++++++--
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 941697d6d6894..a39f2dc7398a0 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2758,6 +2758,8 @@ def set_gguf_parameters(self):
        layer_norm_eps = self.hparams["layer_norm_epsilon"]
        rescale_every_n_layers = self.hparams["rescale_every"]
        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+       time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+       time_decay_extra_dim = 128 if hidden_size == 4096 else 64

        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
@@ -2766,6 +2768,8 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
        self.gguf_writer.add_wkv_head_size(head_size)
+       self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+       self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
        self.gguf_writer.add_feed_forward_length(intermediate_size)
        self.gguf_writer.add_file_type(self.ftype)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ebeb200aae961..a48c4fb676a46 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -95,6 +95,8 @@ class LLM:
        ATTN_LOGIT_SOFTCAPPING  = "{arch}.attn_logit_softcapping"
        FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
        RESCALE_EVERY_N_LAYERS  = "{arch}.rescale_every_n_layers"
+       TIME_MIX_EXTRA_DIM      = "{arch}.time_mix_extra_dim"
+       TIME_DECAY_EXTRA_DIM    = "{arch}.time_decay_extra_dim"

    class Attention:
        HEAD_COUNT = "{arch}.attention.head_count"

diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 0388db567d8bd..3c95c26730f7a 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -673,6 +673,12 @@ def add_expert_weights_scale(self, value: float) -> None:
    def add_rescale_every_n_layers(self, count: int) -> None:
        self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)

+   def add_time_mix_extra_dim(self, dim: int) -> None:
+       self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+   def add_time_decay_extra_dim(self, dim: int) -> None:
+       self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
    def add_wkv_head_size(self, size: int) -> None:
        self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)

diff --git a/src/llama.cpp b/src/llama.cpp
index a0b9dbc87c742..62e8c0c34be29 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -298,6 +298,8 @@ enum llm_kv {
    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
    LLM_KV_RESCALE_EVERY_N_LAYERS,
+   LLM_KV_TIME_MIX_EXTRA_DIM,
+   LLM_KV_TIME_DECAY_EXTRA_DIM,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -400,6 +402,8 @@ static const std::map LLM_KV_NAMES = {
    { LLM_KV_ATTN_LOGIT_SOFTCAPPING,    "%s.attn_logit_softcapping"   },
    { LLM_KV_FINAL_LOGIT_SOFTCAPPING,   "%s.final_logit_softcapping"  },
    { LLM_KV_RESCALE_EVERY_N_LAYERS,    "%s.rescale_every_n_layers"   },
+   { LLM_KV_TIME_MIX_EXTRA_DIM,        "%s.time_mix_extra_dim"       },
+   { LLM_KV_TIME_DECAY_EXTRA_DIM,      "%s.time_decay_extra_dim"     },

    { LLM_KV_ATTENTION_HEAD_COUNT,      "%s.attention.head_count"     },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV,   "%s.attention.head_count_kv"  },
@@ -2296,6 +2300,8 @@ struct llama_hparams {

    // for RWKV
    uint32_t rescale_every_n_layers = 0;
+   uint32_t time_mix_extra_dim = 0;
+   uint32_t time_decay_extra_dim = 0;
    uint32_t wkv_head_size = 0;

    float rope_attn_factor = 1.0f;
@@ -2362,6 +2368,8 @@ struct llama_hparams {
        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;

        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+       if (this->time_mix_extra_dim    != other.time_mix_extra_dim)    return true;
+       if (this->time_decay_extra_dim  != other.time_decay_extra_dim)  return true;
        if (this->wkv_head_size != other.wkv_head_size) return true;

        if (this->dec_start_token_id != other.dec_start_token_id) return true;
@@ -5909,6 +5917,8 @@ static void llm_load_hparams(
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+               ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+               ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);

                switch (hparams.n_layer) {
@@ -8365,8 +8375,8 @@ static bool llm_load_tensors(
                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});

-                   const int time_mix_extra_dim = (n_embd == 4096) ? 64 : 32;
-                   const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64;
+                   const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                   const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                    const int head_size = hparams.wkv_head_size;
                    const int attn_hidden_size = n_embd;
                    const int ffn_size = hparams.n_ff_arr[0];

From 601b5920c6aa839500a6d7580589a91c24279326 Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Mon, 26 Aug 2024 09:31:21 +0800
Subject: [PATCH 45/53] converter: Match ``new_name`` instead of ``name`` for float32 explicit tensors

Signed-off-by: Molly Sophia
---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a39f2dc7398a0..7374cb25e39ce 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -302,7 +302,7 @@ def prepare_tensors(self):
                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                        )
                    )
-                   or not name.endswith(".weight")
+                   or not new_name.endswith(".weight")
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

From e0ea51144ef9c5be2eafee5275ce37f8b414615b Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Mon, 26 Aug 2024 09:32:16 +0800
Subject: [PATCH 46/53] llama: rwkv6: Keep ``time_mix_w1/w2`` as F32

Signed-off-by: Molly Sophia
---
 convert_hf_to_gguf.py | 2 ++
 src/llama.cpp         | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7374cb25e39ce..27ac34b810acd 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -300,6 +300,8 @@ def prepare_tensors(self):
                            gguf.MODEL_TENSOR.TOKEN_TYPES,
                            gguf.MODEL_TENSOR.SSM_CONV1D,
                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                           gguf.MODEL_TENSOR.TIME_MIX_W1,
+                           gguf.MODEL_TENSOR.TIME_MIX_W2,
                        )
                    )
                    or not new_name.endswith(".weight")

diff --git a/src/llama.cpp b/src/llama.cpp
index 62e8c0c34be29..e437d265e4595 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17472,6 +17472,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // do not quantize RWKV's time_mix_first tensors
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
+       quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+       quantize &= name.find("time_mix_w2.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

From 5f00c52be05dfc058e9074999fc249fcde737542 Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Mon, 26 Aug 2024 09:50:51 +0800
Subject: [PATCH 47/53] llama: rwkv6: Remove unused nodes

Signed-off-by: Molly Sophia
---
 src/llama.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index e437d265e4595..fc682ba36abf2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15135,15 +15135,6 @@ struct llm_build_context {
            cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm_ffn, x_prev));
            ggml_build_forward_expand(gf, cur);

-           token_shift = ggml_cont(
-               ctx0,
-               ggml_permute(
-                   ctx0,
-                   ggml_reshape_3d(ctx0, token_shift, n_embd, n_seqs, 2),
-                   0, 2, 1, 3
-               )
-           );
-
            struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
            struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn));

From 7444046c47ae92c8561107aa82068c4e080e0e24 Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Mon, 26 Aug 2024 09:52:11 +0800
Subject: [PATCH 48/53] llama: rwkv6: Apply code format changes

Signed-off-by: Molly Sophia
---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index fc682ba36abf2..9bf8d65f3ed12 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15125,7 +15125,7 @@ struct llm_build_context {
                )
            );

-           ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
+           struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
            x_prev = ggml_concat(
                ctx0,
                ffn_shift,
@@ -15161,7 +15161,7 @@ struct llm_build_context {
        }

        cur = inpL;
-       ggml_tensor * inp_out_ids = build_inp_out_ids();
+       struct ggml_tensor * inp_out_ids = build_inp_out_ids();
        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
        cur = ggml_get_rows(ctx0, cur, inp_out_ids);

From 7f2ef566398ceda50112ffb447772fbaea8e77ca Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Fri, 30 Aug 2024 12:11:31 +0800
Subject: [PATCH 49/53] llama: rwkv6: Add lora for some supported tensors

Currently att.key/receptance/value/gate/output, ffn.receptance/key/value,
as well as head.weight

Signed-off-by: Molly Sophia
---
 src/llama.cpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 9bf8d65f3ed12..ef6c4632bbdf8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9384,6 +9384,7 @@ static struct ggml_tensor * llm_build_mamba(
 }

 static struct ggml_tensor * llm_build_time_mix_rwkv6(
+       struct llama_context & lctx,
        struct ggml_context * ctx,
        const struct llama_layer * layer,
        struct ggml_tensor * cur,
@@ -9481,12 +9482,12 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6(
        cur
    );

-   struct ggml_tensor * r = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
-   struct ggml_tensor * k = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
-   struct ggml_tensor * v = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
+   struct ggml_tensor * r = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
+   struct ggml_tensor * k = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
+   struct ggml_tensor * v = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
    struct ggml_tensor * g = ggml_silu(
        ctx,
-       ggml_mul_mat(ctx, layer->time_mix_gate, xg)
+       llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
    );

    struct ggml_tensor * w = ggml_mul_mat(
@@ -9516,12 +9517,13 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6(
    cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);

    cur = ggml_mul(ctx, cur, g);
-   cur = ggml_mul_mat(ctx, layer->time_mix_output, cur);
+   cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);

    return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
 }

 static struct ggml_tensor * llm_build_channel_mix_rwkv6(
+       struct llama_context & lctx,
        struct ggml_context * ctx,
        const struct llama_layer * layer,
        struct ggml_tensor * cur,
@@ -9530,15 +9532,15 @@ static struct ggml_tensor * llm_build_channel_mix_rwkv6(
    struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
    struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);

-   struct ggml_tensor * r = ggml_sigmoid(ctx, ggml_mul_mat(ctx, layer->channel_mix_receptance, xr));
+   struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
    struct ggml_tensor * k = ggml_sqr(
        ctx,
        ggml_relu(
            ctx,
-           ggml_mul_mat(ctx, layer->channel_mix_key, xk)
+           llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
        )
    );
-   return ggml_mul(ctx, r, ggml_mul_mat(ctx, layer->channel_mix_value, k));
+   return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
 }

 struct llm_build_context {
@@ -15109,7 +15111,7 @@ struct llm_build_context {
                1
            );

-           cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm_att, x_prev, &wkv_states));
+           cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
            ggml_build_forward_expand(gf, cur);
            ggml_build_forward_expand(
                gf,
@@ -15132,7 +15134,7 @@ struct llm_build_context {
                ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
                1
            );
-           cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm_ffn, x_prev));
+           cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(lctx, ctx0, layer, x_norm_ffn, x_prev));
            ggml_build_forward_expand(gf, cur);

            struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
@@ -15166,7 +15168,7 @@ struct llm_build_context {
        cur = ggml_get_rows(ctx0, cur, inp_out_ids);

        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
-       cur = ggml_mul_mat(ctx0, model.output, cur);
+       cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);

        cb(cur, "result_output", -1);
        ggml_build_forward_expand(gf, cur);
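Aside (editorial, not part of the patch series): the patch above swaps plain ggml_mul_mat calls for llm_build_lora_mm so that runtime LoRA adapters also affect the RWKV projections and the output head. The helper's body is not shown here, so the exact semantics are an assumption, but the usual LoRA formulation it would correspond to is a low-rank update added on top of the base matmul. A minimal NumPy sketch of that idea (all names hypothetical):

    import numpy as np

    def lora_matmul(W, x, adapters):
        # adapters: list of (A, B, scale) low-rank pairs.
        # Assumed formulation: W @ x + scale * B @ (A @ x); this mirrors the
        # common LoRA definition, not necessarily llm_build_lora_mm verbatim.
        y = W @ x
        for A, B, scale in adapters:
            y = y + scale * (B @ (A @ x))
        return y

    d_out, d_in, rank = 64, 48, 4
    W = np.random.randn(d_out, d_in)
    A = np.random.randn(rank, d_in)    # down-projection
    B = np.random.randn(d_out, rank)   # up-projection
    x = np.random.randn(d_in)
    y = lora_matmul(W, x, [(A, B, 0.5)])

With no adapters loaded the extra term vanishes and the call behaves like an ordinary matmul, which is why routing every projection through the helper is harmless for plain inference.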
From 7004323ecdd5f4dab77e626ea0e677fcf175542e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 30 Aug 2024 13:19:14 +0300
Subject: [PATCH 50/53] rwkv : speed-up tokenization using trie

---
 src/llama-vocab.cpp | 64 +++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 9be52d7372c21..12fbb59714862 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -58,17 +58,17 @@ struct naive_trie {
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
-       } else {
-           return std::make_pair(key, offset);
        }
+
+       return std::make_pair(key, offset);
    }
-   struct naive_trie * traverse(const char c) {
+   const struct naive_trie * traverse(const char c) const {
        auto res = children.find(c);
        if (res != children.end()) {
            return &res->second;
-       } else {
-           return NULL;
        }
+
+       return NULL;
    }
    std::map children;
    bool has_value;
@@ -843,7 +843,7 @@ struct llm_tokenizer_ugm {
        // traverse the token matcher trie to find a matching token
        bool single_codepoint_token_found = false;
        const struct best_tokenization & current_best = tokenization_results[input_offset];
-       struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
+       const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);

        while (prefix_offset <= input_len && node != NULL) {
            // check if we found valid token in prefix
@@ -1103,6 +1103,7 @@ struct llm_tokenizer_ugm {

 static std::vector llama_unescape_rwkv_token(const std::string & escaped) {
    std::vector output;
+   output.reserve(escaped.size());

    // Parser state
    bool escaping = false;
@@ -1158,9 +1159,12 @@ struct llm_tokenizer_rwkv {
    llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) {
        // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
        // For now, we decode the vocab here into the lookup we'll use for tokenization.
-       for (const auto & token : vocab.id_to_token) {
-           auto data = llama_unescape_rwkv_token(token.text);
-           tokens.push_back(data);
+
+       // build trie
+       for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
+           const auto & token = vocab.id_to_token[id];
+           const auto data = llama_unescape_rwkv_token(token.text);
+           token_matcher.insert((const char *) data.data(), data.size(), id);
        }
    }

@@ -1168,36 +1172,34 @@ struct llm_tokenizer_rwkv {
        uint32_t position = 0;

        while (position < text.size()) {
-           // Iterate through possible tokens backwards, starting with the largest
-           for (int32_t i = (int32_t)tokens.size() - 1; i >= 0; i--) {
-               // Skip tokens that aren't normal type, we can't match on those
-               if (!(vocab.id_to_token[i].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
-                   continue;
-               }
-
-               uint32_t token_size = tokens[i].size();
-
-               // If there's not enough left for this token
-               if (text.size() - position < token_size) {
-                   continue;
-               }
+           const struct naive_trie * node = token_matcher.traverse(text[position]);
+           if (node == NULL) {
+               // no matching token found, add unknown token
+               output.push_back(vocab.special_unk_id);
+               position += 1;
+               continue;
+           }

-               // If the token doesn't match the data
-               if (std::memcmp(text.data() + position, tokens[i].data(), token_size) != 0) {
-                   continue;
+           // traverse the trie to find the longest matching token
+           uint32_t token_id = 0;
+           uint32_t token_length = 0;
+           while (node != NULL) {
+               if (node->has_value) {
+                   token_id = node->value;
+                   token_length = position + 1;
                }
-
-               // Add the token and advance
-               output.push_back(i);
-               position += token_size;
-               break;
+               node = node->traverse(text[++position]);
            }
+
+           // add the longest matching token
+           output.push_back(token_id);
+           position = token_length;
        }
    }

    const llama_vocab & vocab;

-   std::vector> tokens;
+   struct naive_trie token_matcher;
 };

 //
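Aside (editorial, not part of the patch series): the rewrite above replaces a per-position linear scan over the whole vocabulary with a byte trie and greedy longest-prefix matching, which is what makes RWKV tokenization fast. A minimal Python sketch of the same idea, independent of the llama.cpp types:

    class TrieNode:
        def __init__(self):
            self.children = {}      # byte -> TrieNode
            self.token_id = None    # set when a vocab entry ends here

    def build_trie(vocab):
        # vocab: dict mapping token bytes -> token id
        root = TrieNode()
        for text, tok_id in vocab.items():
            node = root
            for b in text:
                node = node.children.setdefault(b, TrieNode())
            node.token_id = tok_id
        return root

    def tokenize(data: bytes, root: TrieNode, unk_id: int = 0):
        out, pos = [], 0
        while pos < len(data):
            node, best_id, best_end = root, None, pos
            i = pos
            # walk the trie as far as the input allows, remembering the
            # longest prefix that is itself a token (greedy longest match)
            while i < len(data) and data[i] in node.children:
                node = node.children[data[i]]
                i += 1
                if node.token_id is not None:
                    best_id, best_end = node.token_id, i
            if best_id is None:
                out.append(unk_id)   # no prefix matched; emit unknown, skip one byte
                pos += 1
            else:
                out.append(best_id)
                pos = best_end
        return out

    vocab = {b"a": 1, b"ab": 2, b"abc": 3, b"b": 4, b"c": 5}
    print(tokenize(b"ababc", build_trie(vocab)))   # [2, 3]

Each position now costs at most the length of the longest matching token, instead of one comparison per vocabulary entry.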
From 59dc2e7099a2aa8ba13ce2dc4ba7aa21b2490010 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 30 Aug 2024 13:30:52 +0300
Subject: [PATCH 51/53] minor : style + indentation

---
 ggml/include/ggml.h | 12 ++++++------
 src/llama.cpp       | 43 ++++++++++++++++++++++++-------------------
 2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 39aff9e39a68a..8bf39cb5d9d62 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1899,12 +1899,12 @@ extern "C" {

    GGML_API struct ggml_tensor * ggml_rwkv_wkv(
            struct ggml_context * ctx,
-           struct ggml_tensor * k,
-           struct ggml_tensor * v,
-           struct ggml_tensor * r,
-           struct ggml_tensor * tf,
-           struct ggml_tensor * td,
-           struct ggml_tensor * state);
+           struct ggml_tensor  * k,
+           struct ggml_tensor  * v,
+           struct ggml_tensor  * r,
+           struct ggml_tensor  * tf,
+           struct ggml_tensor  * td,
+           struct ggml_tensor  * state);

    // custom operators

diff --git a/src/llama.cpp b/src/llama.cpp
index ef6c4632bbdf8..6c374277d4138 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9383,24 +9383,25 @@ static struct ggml_tensor * llm_build_mamba(
    return cur;
 }

-static struct ggml_tensor * llm_build_time_mix_rwkv6(
-   struct llama_context & lctx,
-   struct ggml_context * ctx,
-   const struct llama_layer * layer,
-   struct ggml_tensor * cur,
-   struct ggml_tensor * x_prev,
-   struct ggml_tensor ** wkv_state) {
-   size_t n_embed = cur->ne[0];
+static struct ggml_tensor * llm_build_rwkv6_time_mix(
+       struct llama_context & lctx,
+       struct ggml_context * ctx,
+       const struct llama_layer * layer,
+       struct ggml_tensor * cur,
+       struct ggml_tensor * x_prev,
+       struct ggml_tensor ** wkv_state) {
+   size_t n_embed       = cur->ne[0];
    size_t n_seq_tokens = cur->ne[1];
-   size_t n_seqs = cur->ne[2];
-   size_t head_size = layer->time_mix_first->ne[0];
+   size_t n_seqs        = cur->ne[2];
+
+   size_t head_size  = layer->time_mix_first->ne[0];
    size_t head_count = layer->time_mix_first->ne[1];

    size_t n_tokens = n_seqs * n_seq_tokens;

    struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);

-   sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
+   sx  = ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
    cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens);

    struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
@@ -9498,6 +9499,7 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6(
            ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
        )
    );
+
    w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
    w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
    w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
@@ -9505,6 +9507,7 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6(
    k = ggml_transpose(ctx, k);
    v = ggml_transpose(ctx, v);
    r = ggml_transpose(ctx, r);
+
    struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
    cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
    *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
@@ -9512,6 +9515,7 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6(
    // group norm with head_count groups
    cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
    cur = ggml_norm(ctx, cur, 64e-5f);
+
    // Convert back to regular vectors.
    cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
    cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
@@ -9522,12 +9526,12 @@ static struct ggml_tensor * llm_build_time_mix_rwkv6(
    return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
 }

-static struct ggml_tensor * llm_build_channel_mix_rwkv6(
-   struct llama_context & lctx,
-   struct ggml_context * ctx,
-   const struct llama_layer * layer,
-   struct ggml_tensor * cur,
-   struct ggml_tensor * x_prev) {
+static struct ggml_tensor * llm_build_rwkv6_channel_mix(
+       struct llama_context & lctx,
+       struct ggml_context * ctx,
+       const struct llama_layer * layer,
+       struct ggml_tensor * cur,
+       struct ggml_tensor * x_prev) {
    struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
    struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
    struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
@@ -9540,6 +9544,7 @@ static struct ggml_tensor * llm_build_channel_mix_rwkv6(
            llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
        )
    );
+
    return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
 }

@@ -15111,7 +15116,7 @@ struct llm_build_context {
                1
            );

-           cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+           cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
            ggml_build_forward_expand(gf, cur);
            ggml_build_forward_expand(
                gf,
@@ -15134,7 +15139,7 @@ struct llm_build_context {
                ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
                1
            );
-           cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(lctx, ctx0, layer, x_norm_ffn, x_prev));
+           cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
            ggml_build_forward_expand(gf, cur);

            struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));

From 51753757151f7f7084ed223153ee40daa3c598ea Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Sat, 31 Aug 2024 11:59:30 +0800
Subject: [PATCH 52/53] llama: rwkv6: Avoid division by zero

Co-authored-by: compilade
---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 6c374277d4138..8249756e9bfb7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15156,7 +15156,7 @@ struct llm_build_context {
                )
            );

-           if ((il + 1) % hparams.rescale_every_n_layers == 0) {
+           if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
                cur = ggml_scale(ctx0, cur, 0.5F);
            }

From 846358d358643dd0cc1ca1ea3d8a9c42b5086116 Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Sat, 31 Aug 2024 12:17:08 +0800
Subject: [PATCH 53/53] ggml: rwkv_wkv: Avoid copying the state

Signed-off-by: Molly Sophia
---
 ggml/src/ggml.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 93f3933e75127..faf15170c74cf 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -16874,7 +16874,6 @@ static void ggml_compute_forward_rwkv_wkv_f32(
    float * r = (float *) dst->src[2]->data;
    float * time_faaaa = (float *) dst->src[3]->data;
    float * time_decay = (float *) dst->src[4]->data;
-   memcpy(state, dst->src[5]->data, (C / H) * C * n_seqs * sizeof(float));

    size_t t_stride = H * (C / H);
@@ -16887,7 +16886,9 @@ static void ggml_compute_forward_rwkv_wkv_f32(
    // recursive through each token
    for (size_t t = 0; t < T; t++) {
        size_t t_offset = t * t_stride;
-       float * state_cur = state + (C / H) * C * (t / (T / n_seqs));
+       size_t state_offset = (C / H) * C * (t / (T / n_seqs));
+       float * state_cur = state + state_offset;
+       float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;

        for (size_t h = 0; h < H; h++) {
            size_t h_offset = h * h_stride;
@@ -16911,7 +16912,7 @@ static void ggml_compute_forward_rwkv_wkv_f32(
                float v_val = v[t_h_j_offset];
                float kv_val = v_val * k_val;

-               float prev_state_val = state_cur[h_2d_i_j_offset];
+               float prev_state_val = state_prev[h_2d_i_j_offset];
                float temp_val = kv_val * time_faaaa_val + prev_state_val;
                dst_data[t_h_j_offset] += temp_val * r_val;
                state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
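Aside (editorial, not part of the patch series): the final patch drops the upfront memcpy of the incoming WKV state; on the first token of each sequence the previous state is read straight from the source tensor (state_prev), and every later token of that sequence reads the state written in place by the previous step (state_cur). To make the recurrence itself concrete, a NumPy sketch of the same per-head update the C loops perform, for a single sequence; the [T, H, S] layout and variable names are chosen for readability, not taken from ggml:

    import numpy as np

    def rwkv6_wkv(k, v, r, tf, td, state):
        # k, v, r, td: [T, H, S]; tf: [H, S]; state: [H, S, S] incoming state
        # Mirrors ggml_compute_forward_rwkv_wkv_f32's inner loops:
        #   out[t,h,j] += (k[t,h,i]*v[t,h,j]*tf[h,i] + state[h,i,j]) * r[t,h,i]
        #   state[h,i,j] = state[h,i,j]*td[t,h,i] + k[t,h,i]*v[t,h,j]
        T, H, S = k.shape
        out = np.zeros((T, H, S), dtype=k.dtype)
        state = state.copy()   # the patch avoids this copy in ggml by reading
                               # the first step directly from dst->src[5]
        for t in range(T):
            for h in range(H):
                kv = np.outer(k[t, h], v[t, h])               # kv[i, j]
                out[t, h] = (kv * tf[h][:, None] + state[h]).T @ r[t, h]
                state[h] = state[h] * td[t, h][:, None] + kv
        return out, state

    T, H, S = 5, 2, 8
    k = np.random.randn(T, H, S)
    v = np.random.randn(T, H, S)
    r = np.random.randn(T, H, S)
    td = np.random.rand(T, H, S)       # per-channel decay in (0, 1)
    tf = np.random.randn(H, S)         # the "time_faaaa" bonus term
    out, new_state = rwkv6_wkv(k, v, r, tf, td, np.zeros((H, S, S)))

The returned state is what llama.cpp writes back into the recurrent-state slot of the KV cache, so the next ubatch of the same sequence can continue the recurrence.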