From 59a08be755cc64bdb6db9bcb3c8f9d6da6d8846b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 11:25:08 +0000 Subject: [PATCH 01/18] feat: initial support for llama.cpp --- README.md | 1 + convert_hf_to_gguf.py | 34 ++++++++++++++++++++++++++++++++++ gguf-py/gguf/constants.py | 2 ++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama.cpp | 15 +++++++++++++++ 5 files changed, 55 insertions(+) diff --git a/README.md b/README.md index e76ba921853c0..911daa40e9311 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ Typically finetunes of the base models below are supported as well. - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) +- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6a1a3a937febd..e7dae3796bcda 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2756,6 +2756,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_mamba_b_dt_rms(False) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None @@ -3854,6 +3855,39 @@ def prepare_tensors(self): self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) super().prepare_tensors() + + +@Model.register("FalconMambaForCausalLM") +class FalconMambaModel(MambaModel): + model_arch = gguf.MODEL_ARCH.MAMBA + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_mamba_b_dt_rms(True) # For FalconMamba we do apply rms norm on B / DT layers + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) 
+ self.gguf_writer.add_file_type(self.ftype) + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5541972ce52b0..beb7d6f5fec3b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -130,6 +130,7 @@ class SSM: INNER_SIZE = "{arch}.ssm.inner_size" STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" + B_DT_RMS = "{arch}.ssm.b_dt_rms" class Tokenizer: MODEL = "tokenizer.ggml.model" @@ -1372,6 +1373,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK +KEY_SSM_B_DT_RMS = Keys.SSM.B_DT_RMS # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 76385a82872c9..41b443e539ace 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -714,6 +714,9 @@ def add_rope_scaling_orig_ctx_len(self, value: int) -> None: def add_rope_scaling_finetuned(self, value: bool) -> None: self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) + + def add_mamba_b_dt_rms(self, value: bool) -> None: + self.add_bool(Keys.SSM.B_DT_RMS.format(arch=self.arch), value) def add_rope_scaling_yarn_log_mul(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value) diff --git a/src/llama.cpp b/src/llama.cpp index 5ab65ea97defa..52c8bff61dee4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -328,6 +328,7 @@ enum llm_kv { LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, LLM_KV_SSM_TIME_STEP_RANK, + LLM_KV_SSM_B_DT_RMS, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_PRE, @@ -426,6 +427,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_B_DT_RMS, "%s.ssm.b_dt_rms" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, @@ -2237,6 +2239,7 @@ struct llama_hparams { uint32_t ssm_d_inner = 0; uint32_t ssm_d_state = 0; uint32_t ssm_dt_rank = 0; + bool ssm_b_dt_rms = false; float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; @@ -5052,6 +5055,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_B_DT_RMS, hparams.ssm_b_dt_rms); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -12161,6 +12165,10 @@ struct llm_build_context { GGML_ASSERT(2 * d_model == d_inner); const int64_t d_state = hparams.ssm_d_state; const int64_t dt_rank = hparams.ssm_dt_rank; + // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) + const bool ssm_b_dt_rms = hparams.ssm_b_dt_rms; + // Use the same RMS norm as the final layer norm + const float norm_rms_eps = hparams.f_norm_rms_eps; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -12241,6 +12249,13 @@ struct llm_build_context { struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank); struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); + // Some Mamba variants (e.g. 
FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_b_dt_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); + } + // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens} dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); From bfa0286866885213793e04e3e280013f1fff5f7a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 11:40:57 +0000 Subject: [PATCH 02/18] fix: lint --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e7dae3796bcda..aa23bf95c4602 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3793,7 +3793,7 @@ class ExaoneModel(Model): def set_gguf_parameters(self): hparams = self.hparams - assert(hparams["activation_function"] == "silu") + assert (hparams["activation_function"] == "silu") max_position_embeddings = hparams["max_position_embeddings"] embed_dim = hparams["hidden_size"] @@ -3855,7 +3855,7 @@ def prepare_tensors(self): self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) super().prepare_tensors() - + @Model.register("FalconMambaForCausalLM") class FalconMambaModel(MambaModel): From b97704c9a00b11936453201b29fa9675e0b964f9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 12:02:36 +0000 Subject: [PATCH 03/18] refactor: better refactor --- convert_hf_to_gguf.py | 49 +++++++++---------------------------------- 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index aa23bf95c4602..7242ff9d558a5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2711,7 +2711,7 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("MambaForCausalLM", "MambaLMHeadModel") +@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA @@ -2731,7 +2731,7 @@ def set_vocab(self): else: # Use the GPT-NeoX tokenizer when no tokenizer files are present self._set_vocab_builtin("gpt-neox", vocab_size) - + def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model"]) d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 @@ -2742,7 +2742,11 @@ def set_gguf_parameters(self): # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - + num_hidden_layers = self.find_hparam(["n_layer", "num_hidden_layers"]) + use_b_dt_norm = False + # For falconmamba we do apply RMS norm on B / DT and C layers + if self.find_hparam(["model_type"]) in ["falcon_mamba"]: + use_b_dt_norm = True # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model @@ -2750,13 +2754,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.hparams["n_layer"]) + 
self.gguf_writer.add_block_count(num_hidden_layers) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_mamba_b_dt_rms(False) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_mamba_b_dt_rms(use_b_dt_norm) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None @@ -3855,43 +3859,10 @@ def prepare_tensors(self): self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) super().prepare_tensors() - - -@Model.register("FalconMambaForCausalLM") -class FalconMambaModel(MambaModel): - model_arch = gguf.MODEL_ARCH.MAMBA - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - - # Fail early for models which don't have a block expansion factor of 2 - assert d_inner == 2 * d_model - - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_mamba_b_dt_rms(True) # For FalconMamba we do apply rms norm on B / DT layers - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_file_type(self.ftype) - + ###### CONVERSION LOGIC ###### - # tree of lazy tensors class LazyTorchTensor(gguf.LazyBase): _tensor_type = torch.Tensor From 343b5836c1906cf2230cb271139edac122ebc317 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:11:21 +0400 Subject: [PATCH 04/18] Update src/llama.cpp Co-authored-by: compilade --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 52c8bff61dee4..266a5576a9b58 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5055,7 +5055,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); - ml.get_key(LLM_KV_SSM_B_DT_RMS, hparams.ssm_b_dt_rms); + ml.get_key(LLM_KV_SSM_B_DT_RMS, hparams.ssm_b_dt_rms, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); From a8109e352e177870bf8aa2dde2be619e9a32fad1 
Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:12:04 +0400 Subject: [PATCH 05/18] Update src/llama.cpp Co-authored-by: compilade --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 266a5576a9b58..b7eb89ee540c8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -427,7 +427,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_SSM_B_DT_RMS, "%s.ssm.b_dt_rms" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, From 184a4c676fedf461e1fbf616a396b267aee18e2a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 14:14:17 +0000 Subject: [PATCH 06/18] fix: address comments --- convert_hf_to_gguf.py | 7 +++---- gguf-py/gguf/constants.py | 4 ++-- gguf-py/gguf/gguf_writer.py | 4 ++-- src/llama.cpp | 10 +++++----- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7242ff9d558a5..ab2e358481c14 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2741,8 +2741,7 @@ def set_gguf_parameters(self): # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - num_hidden_layers = self.find_hparam(["n_layer", "num_hidden_layers"]) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 use_b_dt_norm = False # For falconmamba we do apply RMS norm on B / DT and C layers if self.find_hparam(["model_type"]) in ["falcon_mamba"]: @@ -2754,13 +2753,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(num_hidden_layers) + self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_mamba_b_dt_rms(use_b_dt_norm) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_mamba_dt_b_c_rms(use_b_dt_norm) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index beb7d6f5fec3b..92e8fd066c41b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -130,7 +130,7 @@ class SSM: INNER_SIZE = "{arch}.ssm.inner_size" STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" - B_DT_RMS = "{arch}.ssm.b_dt_rms" + DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" class Tokenizer: MODEL = "tokenizer.ggml.model" @@ -1373,7 +1373,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE KEY_SSM_STATE_SIZE = 
Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK -KEY_SSM_B_DT_RMS = Keys.SSM.B_DT_RMS +KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 41b443e539ace..331d8af96f956 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -715,8 +715,8 @@ def add_rope_scaling_orig_ctx_len(self, value: int) -> None: def add_rope_scaling_finetuned(self, value: bool) -> None: self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) - def add_mamba_b_dt_rms(self, value: bool) -> None: - self.add_bool(Keys.SSM.B_DT_RMS.format(arch=self.arch), value) + def add_mamba_dt_b_c_rms(self, value: bool) -> None: + self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) def add_rope_scaling_yarn_log_mul(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value) diff --git a/src/llama.cpp b/src/llama.cpp index b7eb89ee540c8..301c3933aa71c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -328,7 +328,7 @@ enum llm_kv { LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, LLM_KV_SSM_TIME_STEP_RANK, - LLM_KV_SSM_B_DT_RMS, + LLM_KV_SSM_DT_B_C_RMS, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_PRE, @@ -2239,7 +2239,7 @@ struct llama_hparams { uint32_t ssm_d_inner = 0; uint32_t ssm_d_state = 0; uint32_t ssm_dt_rank = 0; - bool ssm_b_dt_rms = false; + bool ssm_dt_b_c_rms = false; float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; @@ -5055,7 +5055,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); - ml.get_key(LLM_KV_SSM_B_DT_RMS, hparams.ssm_b_dt_rms, false); + ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -12166,7 +12166,7 @@ struct llm_build_context { const int64_t d_state = hparams.ssm_d_state; const int64_t dt_rank = hparams.ssm_dt_rank; // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) - const bool ssm_b_dt_rms = hparams.ssm_b_dt_rms; + const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; // Use the same RMS norm as the final layer norm const float norm_rms_eps = hparams.f_norm_rms_eps; @@ -12250,7 +12250,7 @@ struct llm_build_context { struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); // Some Mamba variants (e.g. 
FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_b_dt_rms) { + if (ssm_dt_b_c_rms) { dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); B = ggml_rms_norm(ctx0, B, norm_rms_eps); C = ggml_rms_norm(ctx0, C, norm_rms_eps); From 349426546bd30744acb7fda011f5b3f4b9821631 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:15:40 +0400 Subject: [PATCH 07/18] Update convert_hf_to_gguf.py Co-authored-by: compilade --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ab2e358481c14..b429fdb83d4ef 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2744,7 +2744,7 @@ def set_gguf_parameters(self): rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 use_b_dt_norm = False # For falconmamba we do apply RMS norm on B / DT and C layers - if self.find_hparam(["model_type"]) in ["falcon_mamba"]: + if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): use_b_dt_norm = True # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model From f7d2e9105f069d1d1f0e90e2ac162696726a829f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 14:17:33 +0000 Subject: [PATCH 08/18] fix: add more cleanup and harmonization --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b429fdb83d4ef..7d1d2121faa70 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2742,10 +2742,10 @@ def set_gguf_parameters(self): # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - use_b_dt_norm = False + use_dt_b_c_norm = False # For falconmamba we do apply RMS norm on B / DT and C layers if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): - use_b_dt_norm = True + use_dt_b_c_norm = True # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model @@ -2759,7 +2759,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_mamba_dt_b_c_rms(use_b_dt_norm) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_mamba_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None From 9e22bb7ed6b2b7f597b965328ff483c562a1eed5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 14:20:37 +0000 Subject: [PATCH 09/18] fix: lint --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7d1d2121faa70..2efa9f1180ac4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2731,7 +2731,7 @@ def set_vocab(self): else: # Use the GPT-NeoX tokenizer when no tokenizer files are present self._set_vocab_builtin("gpt-neox", vocab_size) - + def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model"]) d_conv = self.find_hparam(["conv_kernel", "d_conv"], 
optional=True) or 4 @@ -2741,7 +2741,7 @@ def set_gguf_parameters(self): # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 use_dt_b_c_norm = False # For falconmamba we do apply RMS norm on B / DT and C layers if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): @@ -3858,7 +3858,7 @@ def prepare_tensors(self): self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) super().prepare_tensors() - + ###### CONVERSION LOGIC ###### From 4553502d4a101b90dcf7c26bc9e018b7be4b4fae Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:41:42 +0400 Subject: [PATCH 10/18] Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 331d8af96f956..1aedf4c80c8a3 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -715,7 +715,7 @@ def add_rope_scaling_orig_ctx_len(self, value: int) -> None: def add_rope_scaling_finetuned(self, value: bool) -> None: self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) - def add_mamba_dt_b_c_rms(self, value: bool) -> None: + def add_ssm_dt_b_c_rms(self, value: bool) -> None: self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) def add_rope_scaling_yarn_log_mul(self, value: float) -> None: From d637bb97bc9186b056959652110ebb6244f74033 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 14:42:40 +0000 Subject: [PATCH 11/18] fix: change name --- convert_hf_to_gguf.py | 2 +- gguf-py/gguf/gguf_writer.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2efa9f1180ac4..4b843991fc905 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2759,7 +2759,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_mamba_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 1aedf4c80c8a3..af3b98c679b0b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -714,9 +714,6 @@ def add_rope_scaling_orig_ctx_len(self, value: int) -> None: def add_rope_scaling_finetuned(self, value: bool) -> None: self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) - - def add_ssm_dt_b_c_rms(self, value: bool) -> None: - self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) def add_rope_scaling_yarn_log_mul(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value) @@ -733,6 +730,9 @@ def 
add_ssm_state_size(self, value: int) -> None: def add_ssm_time_step_rank(self, value: int) -> None: self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value) + def add_ssm_dt_b_c_rms(self, value: bool) -> None: + self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) + def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) From 57c3eb41154a68b922c42187191c28495553fe4f Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sun, 18 Aug 2024 19:35:57 +0400 Subject: [PATCH 12/18] Apply suggestions from code review Co-authored-by: compilade --- gguf-py/gguf/constants.py | 4 ++-- src/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 92e8fd066c41b..b55effa9907b1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -130,7 +130,7 @@ class SSM: INNER_SIZE = "{arch}.ssm.inner_size" STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" - DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" class Tokenizer: MODEL = "tokenizer.ggml.model" @@ -1373,7 +1373,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK -KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL diff --git a/src/llama.cpp b/src/llama.cpp index 301c3933aa71c..13222f53abada 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -427,7 +427,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, From bf5e344056cfaaa02d1c9a87b517846ee10d3c3c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 15:37:42 +0000 Subject: [PATCH 13/18] add in operator --- src/llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama.cpp b/src/llama.cpp index 13222f53abada..c1e3c8f246137 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2289,6 +2289,7 @@ struct llama_hparams { if (this->ssm_d_inner != other.ssm_d_inner) return true; if (this->ssm_d_state != other.ssm_d_state) return true; if (this->ssm_dt_rank != other.ssm_dt_rank) return true; + if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true; if (this->dec_start_token_id != other.dec_start_token_id) return true; From ca4db9e551d3a4b9a39acbac58dd5a24e34a06d8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 15:45:24 +0000 Subject: [PATCH 14/18] fix: add `dt_b_c_rms` in `llm_load_print_meta` --- src/llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama.cpp b/src/llama.cpp index c1e3c8f246137..2021b3dfda4d6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5912,6 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); + LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %u\n", __func__, 
hparams.ssm_dt_b_c_rms); } LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); From 78ad84f0fffdb4fadf85613598f404d4b0b56320 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 15:48:10 +0000 Subject: [PATCH 15/18] fix: correct printf format for bool --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2021b3dfda4d6..43a78cbd62346 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5912,7 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); - LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %u\n", __func__, hparams.ssm_dt_b_c_rms); + LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %s\n", __func__, hparams.ssm_dt_b_c_rms); } LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); From 7aeccbb7ddd0cbf349140fe7d54917d8981edde4 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Sun, 18 Aug 2024 16:00:18 +0000 Subject: [PATCH 16/18] fix: correct print format --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 43a78cbd62346..7c2748cda94e0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5912,7 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); - LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %s\n", __func__, hparams.ssm_dt_b_c_rms); + LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); } LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); From 5c0f108e1514a26630349312ffc011865622fd71 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Sun, 18 Aug 2024 20:12:26 +0400 Subject: [PATCH 17/18] Update src/llama.cpp Co-authored-by: compilade --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7c2748cda94e0..84fe4967d5c51 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5912,7 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); - LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); + LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); } LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); From 3491291a32d4a7edb332ebbfb93d872dc1e996dc Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 19 Aug 2024 12:44:35 -0400 Subject: [PATCH 18/18] llama : quantize more Mamba tensors * llama : use f16 as the fallback of fallback quant types --- convert_hf_to_gguf.py | 18 +----------------- src/llama.cpp | 5 +++-- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4b843991fc905..108c822cff5d2 100755 --- a/convert_hf_to_gguf.py +++ 
b/convert_hf_to_gguf.py @@ -295,6 +295,7 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.FFN_GATE_INP, gguf.MODEL_TENSOR.POS_EMBD, gguf.MODEL_TENSOR.TOKEN_TYPES, + gguf.MODEL_TENSOR.SSM_CONV1D, ) ) or not name.endswith(".weight") @@ -2786,23 +2787,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: - if bid is not None and new_name in ( - self.format_tensor_name( - n, bid, ".weight" if name.endswith(".weight") else "" - ) - for n in [ - gguf.MODEL_TENSOR.SSM_CONV1D, - gguf.MODEL_TENSOR.SSM_X, - gguf.MODEL_TENSOR.SSM_DT, - gguf.MODEL_TENSOR.SSM_A, - gguf.MODEL_TENSOR.SSM_D, - ] - ): - return gguf.GGMLQuantizationType.F32 - - return super().tensor_force_quant(name, new_name, bid, n_dims) - @Model.register("CohereForCausalLM") class CommandR2Model(Model): diff --git a/src/llama.cpp b/src/llama.cpp index 84fe4967d5c51..fe3c0db6f2931 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16122,6 +16122,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } + if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { + new_type = GGML_TYPE_F16; + } LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); ++qs.n_fallback; } @@ -16450,8 +16453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // do not quantize Mamba's small yet 2D weights // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ssm_conv1d.weight") == std::string::npos; - quantize &= name.find("ssm_x.weight") == std::string::npos; - quantize &= name.find("ssm_dt.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos;
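
Note on the core mechanism introduced by this series: the new GGUF key `{arch}.ssm.dt_b_c_rms` simply flags that, for FalconMamba, the `dt`, `B` and `C` slices produced by the `ssm_x` projection are RMS-normalized (with the model's `rms_norm_eps`, and no learned weight) before `dt` is projected through `ssm_dt` and before `B`/`C` enter the selective scan. The sketch below is an illustrative NumPy rendering of that step, not the ggml implementation; the tensor sizes and epsilon are placeholder assumptions for demonstration only.

```python
import numpy as np

def rms_norm(x: np.ndarray, eps: float) -> np.ndarray:
    # Same operation as ggml_rms_norm: divide by the RMS of the feature
    # dimension; no learned scale is applied.
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)

# Illustrative shapes only (hypothetical, not taken from a real checkpoint).
n_tokens, dt_rank, d_state = 8, 16, 16
eps = 1e-5  # corresponds to hparams.f_norm_rms_eps in the patched llama.cpp

dt = np.random.randn(n_tokens, dt_rank).astype(np.float32)
B = np.random.randn(n_tokens, d_state).astype(np.float32)
C = np.random.randn(n_tokens, d_state).astype(np.float32)

# When {arch}.ssm.dt_b_c_rms is true (FalconMamba), each of the three
# projections is RMS-normalized before the rest of the Mamba block runs.
dt, B, C = rms_norm(dt, eps), rms_norm(B, eps), rms_norm(C, eps)
```

Because the key is read with `ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false)`, it is optional: existing Mamba GGUF files without it keep their previous behaviour, since `ssm_dt_b_c_rms` defaults to `false`.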