Add Molmo (7B-D, 7B-O, 70B) #33962

Open · wants to merge 145 commits into base: main

Changes from 1 commit

Commits (145)
dc6fcac
add base convert keys + chat template
molbap Oct 1, 2024
574e01f
Merge branch 'main' into add_molmo
molbap Oct 2, 2024
0bd413b
draft: add up modular files for molmo
molbap Oct 4, 2024
9e454e4
Squashed commit of the following:
molbap Oct 8, 2024
d82c471
sync changes
molbap Oct 8, 2024
339a8d3
push a simple fix
ArthurZucker Oct 8, 2024
c0c25d6
finish fixing
ArthurZucker Oct 8, 2024
5ee6a44
Merge branch 'main' into add_molmo
molbap Oct 8, 2024
33e43ec
suppress diff
molbap Oct 8, 2024
d23e1c1
Merge branch 'main' into add_molmo
molbap Oct 10, 2024
c8c12fe
fix
ArthurZucker Oct 10, 2024
0909c02
style
ArthurZucker Oct 10, 2024
1799d20
add config + 2d pooling
molbap Oct 10, 2024
fb133d4
suppress changes
molbap Oct 10, 2024
5ba4105
Merge branch 'add_molmo' of github.com:molbap/transformers into add_m…
molbap Oct 10, 2024
a2a6a9b
fix
ArthurZucker Oct 10, 2024
8fe7a9f
Merge branch 'add_molmo' of github.com:molbap/transformers into add_m…
ArthurZucker Oct 10, 2024
20681f5
conversion works :raised_hands:
molbap Oct 11, 2024
c85af98
fixup
molbap Oct 11, 2024
35ea3cc
handle missing MOLMO_VISION_ATTENTION_CLASSES
molbap Oct 11, 2024
ab79d0e
fix
molbap Oct 11, 2024
b9bdf99
fix fused keys mismatch
molbap Oct 15, 2024
98d5ccd
fix
molbap Oct 15, 2024
3bca742
[Modular-breaking] add manually vision attention classes list
molbap Oct 15, 2024
a13fe05
finish weight conversion script
molbap Oct 15, 2024
fac8dfd
add more keys
molbap Oct 16, 2024
c1e5f19
flipped the linear layers
molbap Oct 16, 2024
a68e5f5
add pooling forward + draft general forward
molbap Oct 16, 2024
8298b80
modeling file with swiglu, forward(input_ids) passing
molbap Oct 16, 2024
9f69c6b
BIG push of image processor
molbap Oct 23, 2024
0711e08
add missing objects to init
molbap Oct 23, 2024
7efe22e
Merge branch 'main' into add_molmo
molbap Nov 5, 2024
f5bd3b0
fix up wrong channel dimension
molbap Nov 7, 2024
3ae884f
fix typo
molbap Nov 7, 2024
3ef60c0
add missing image token indices used in forward
molbap Nov 19, 2024
cf9d4ab
pad patch orderings
molbap Nov 19, 2024
91a2d3c
clean up conversion script
molbap Nov 19, 2024
0f7904f
remind that tests are TODO
molbap Nov 19, 2024
577e347
merge main
zucchini-nlp Nov 21, 2024
b514041
at least it runs like this
zucchini-nlp Nov 24, 2024
cf6cb5d
add bos token
molbap Nov 27, 2024
26c517d
add bos token in prompt
molbap Nov 27, 2024
35c168d
fix processor, missing batching img_mask
molbap Nov 27, 2024
e7275c7
fix image masks + batching
molbap Nov 27, 2024
3e7530d
working version
zucchini-nlp Nov 27, 2024
4bbc89b
+1 only on non masked indices
zucchini-nlp Nov 27, 2024
54e072b
attempt 1 to make modular work
zucchini-nlp Nov 27, 2024
1e99752
update conversion to fit all ckpt + chat template + clean up a bit
zucchini-nlp Nov 27, 2024
92a1f31
fix processing tests
zucchini-nlp Nov 27, 2024
42330e0
add more tests (failing for now)
zucchini-nlp Nov 27, 2024
932f6d1
fix the conversion
zucchini-nlp Nov 27, 2024
aafb827
done!
zucchini-nlp Nov 27, 2024
36cc6dd
nit
zucchini-nlp Nov 27, 2024
f399c3a
some tests are failing, coming back tomorrow
zucchini-nlp Nov 27, 2024
7322227
adapt to any image format
molbap Nov 27, 2024
e4db50a
Merge branch 'add_molmo' of github.com:molbap/transformers into add_m…
molbap Nov 27, 2024
205a755
try to get batched generation working
molbap Nov 28, 2024
eb61617
fix other tests, should work now
zucchini-nlp Nov 28, 2024
b77d947
adjust test for batching
zucchini-nlp Nov 28, 2024
ba4dd50
little bit of style
zucchini-nlp Nov 28, 2024
0e2d184
docs + imports + automapping
zucchini-nlp Nov 28, 2024
9a83706
remove images kwargs
zucchini-nlp Nov 28, 2024
171eb8e
some unused config attributes
zucchini-nlp Nov 28, 2024
35b517a
remove additional vocab size and pad lm head
zucchini-nlp Nov 28, 2024
6a0cbc5
remove einops dependency
molbap Nov 28, 2024
5c7b141
Merge branch 'add_molmo' of github.com:molbap/transformers into add_m…
molbap Nov 28, 2024
434d4b1
dont skip these tests
zucchini-nlp Nov 28, 2024
4645f97
format + add integration testing
molbap Nov 28, 2024
48f2e21
Merge branch 'add_molmo' of github.com:molbap/transformers into add_m…
molbap Nov 28, 2024
4bb4e48
fix tests + fix 72B conversion
molbap Nov 29, 2024
e676782
fix format
molbap Nov 29, 2024
a74bda2
modular kinda works but adds extra classes like `VisionVisionModel` :(
zucchini-nlp Nov 29, 2024
2c428ae
accommodate 7B-O version as well (broken)
molbap Nov 29, 2024
d338153
merge, fix conflicts and clean up modular extra code
molbap Nov 29, 2024
00376c4
fix 7B-O
zucchini-nlp Dec 2, 2024
48354fe
remove unused code path
zucchini-nlp Dec 2, 2024
d738493
nit
zucchini-nlp Dec 3, 2024
d0e90d4
make modular work mostly
zucchini-nlp Dec 3, 2024
f06b6d9
fix imports
zucchini-nlp Dec 3, 2024
9fc25c0
update modular last time
zucchini-nlp Dec 3, 2024
38dc9e8
fix copies
zucchini-nlp Dec 3, 2024
eb77f3c
fix copies
zucchini-nlp Dec 4, 2024
190cc35
fix tests
zucchini-nlp Dec 4, 2024
84ed244
initial push of fast processor
molbap Dec 6, 2024
b4d48d5
Merge branch 'add_molmo' of github.com:molbap/transformers into add_m…
molbap Dec 6, 2024
1298d08
Merge branch 'main' into add_molmo
molbap Dec 10, 2024
6687d43
fix various issues + tests
molbap Dec 10, 2024
5f79577
add Molmo submodules as private
molbap Dec 10, 2024
9e72758
do not test submodules
molbap Dec 10, 2024
439aed6
[run-slow] molmo
molbap Dec 10, 2024
5a6a965
underscore prefixed method is not public
molbap Dec 10, 2024
b9746a8
fix tests
molbap Dec 10, 2024
2090ed6
fix docs
molbap Dec 10, 2024
8ad3a25
[run-slow] molmo
molbap Dec 10, 2024
0d10ee4
Merge branch 'main' into add_molmo
molbap Dec 10, 2024
9bd96f5
fix cache shape
molbap Dec 10, 2024
af5468b
[run-slow] molmo
molbap Dec 10, 2024
c02c6de
trigger CI
molbap Dec 10, 2024
5f35055
mark flaky test
molbap Dec 10, 2024
2b7af87
add missing objects
molbap Dec 10, 2024
9f0f09d
add config to init
molbap Dec 10, 2024
74ebb24
more init fixes
molbap Dec 10, 2024
8b00c44
fix style
molbap Dec 10, 2024
d6403ad
fix?
molbap Dec 10, 2024
eb43cb9
fix
molbap Dec 10, 2024
33f0624
what is this again
molbap Dec 10, 2024
cc59007
Merge branch 'main' into add_molmo
molbap Dec 10, 2024
23ae692
is this real life
molbap Dec 10, 2024
4c456e7
it was real life, fix broken eager
molbap Dec 10, 2024
91f2820
fix attributes
molbap Dec 10, 2024
e2df6bc
this attention should be fixed
molbap Dec 10, 2024
ae77cc6
set 7b test to bf16
molbap Dec 11, 2024
166b28a
[run-slow] molmo
molbap Dec 11, 2024
50bcb7c
Merge branch 'main' into add_molmo
molbap Dec 11, 2024
bf012d8
[run-slow] molmo
molbap Dec 11, 2024
6e0634b
fix text (variability T4/A100)
molbap Dec 11, 2024
8569fd0
push clean Fast (x3!) image processor
molbap Dec 12, 2024
fd401bc
Merge branch 'main' into add_molmo
molbap Dec 12, 2024
86acf22
fix modular changes from main
molbap Dec 12, 2024
1ebea3c
Merge branch 'main' into add_molmo
molbap Dec 16, 2024
5ebc6f0
push fast image proc with device check
molbap Dec 23, 2024
19d2689
push fast image proc with device check
molbap Dec 23, 2024
c652bb9
format
molbap Dec 23, 2024
50c21e5
images kwargs were missing
molbap Dec 23, 2024
092da76
merge and fix conflicts
molbap Dec 23, 2024
1254eac
style
molbap Dec 23, 2024
bd39143
update with modular conversion
molbap Dec 23, 2024
3efcb13
add torch import
molbap Dec 23, 2024
56ae76f
style
molbap Dec 23, 2024
9417ff7
protect import
molbap Dec 23, 2024
51f9336
fix modular
molbap Dec 23, 2024
3719481
Merge branch 'main' into add_molmo
molbap Jan 7, 2025
f394b02
cherry-pick: cohere (from 67c3fcd4f32c64e07f302f00243be7d54914d78b)
molbap Jan 8, 2025
e418aa3
fix modular with cohere interface
molbap Jan 8, 2025
5af0b57
fixup cohere all imports
molbap Jan 8, 2025
a574b93
fix bf16 test output
molbap Jan 8, 2025
9f3018d
fix
molbap Jan 8, 2025
e2d1ba8
style
molbap Jan 8, 2025
c872095
Merge branch 'main' into add_molmo
molbap Jan 9, 2025
41ab3a7
uniformize fast image processor
molbap Jan 9, 2025
dd74b78
Merge branch 'main' into add_molmo
molbap Jan 9, 2025
d052666
fix merge
molbap Jan 9, 2025
0a822f4
unbloat modular a tad
molbap Jan 9, 2025
8ebf44f
fix import
molbap Jan 9, 2025
4e6070f
fix modular
molbap Jan 9, 2025
fix 7B-O
zucchini-nlp committed Dec 2, 2024
commit 00376c4d9914af4bbba787874044f1a938fc0a57
12 changes: 12 additions & 0 deletions src/transformers/models/molmo/configuration_molmo.py
@@ -300,6 +300,12 @@ class MolmoTextConfig(PretrainedConfig):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        use_postnorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply post layer normalization (rather than pre layer normalization) in each decoder layer.
+        use_attention_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply norm to keys and queries in the attention layer.

    ```python
    >>> from transformers import MolmoTextModel, MolmoTextConfig
@@ -338,6 +344,9 @@ def __init__(
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
+        attention_bias=False,
+        use_postnorm=True,
+        use_attention_layer_norm=False,
        **kwargs,
    ):
        super().__init__(
@@ -354,6 +363,9 @@ def __init__(
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers
+        self.attention_bias = attention_bias
+        self.use_postnorm = use_postnorm
+        self.use_attention_layer_norm = use_attention_layer_norm

        # for backward compatibility
        if num_key_value_heads is None:
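
As a quick illustration of the new flags, a minimal sketch of constructing the text config. It assumes `MolmoTextConfig` from this branch is importable, as in the docstring example above; the non-default values in the second config are illustrative, not taken from any released checkpoint.

```python
from transformers import MolmoTextConfig  # importable on this PR branch only, not in a released version

# Defaults follow the docstring above: no attention bias, post-norm decoder layers, no QK norm.
default_config = MolmoTextConfig()
assert default_config.attention_bias is False
assert default_config.use_postnorm is True
assert default_config.use_attention_layer_norm is False

# A pre-norm variant with attention bias and QK norm enabled (illustrative values only).
prenorm_config = MolmoTextConfig(
    attention_bias=True,
    use_postnorm=False,
    use_attention_layer_norm=True,
)
```
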
30 changes: 8 additions & 22 deletions src/transformers/models/molmo/convert_molmo_weights_to_hf.py
@@ -67,7 +67,7 @@
    r"transformer.blocks.(\d+).(q|k)_norm.weight": r"language_model.model.layers.\1.self_attn.\2_norm.layer.weight",
    r"transformer.blocks.(\d+).attn_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight",
    r"transformer.blocks.(\d+).attn_out.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight",
-    r"transformer.blocks.(\d+).ff_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.layer.weight",
+    r"transformer.blocks.(\d+).ff_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight",
    r"transformer.blocks.(\d+).ff_out.weight": r"language_model.model.layers.\1.mlp.fc2.weight",
    r"transformer.blocks.(\d+).ff_proj.weight": r"language_model.model.layers.\1.mlp.fc1.weight",
    r"transformer.ff_out.weight": r"language_model.lm_head.weight",
@@ -176,24 +176,13 @@ def write_model(
    if variant == "72B":
        pooling_config.text_intermediate_size = 59136
        pooling_config.text_hidden_size = 8192
-        text_config.qkv_bias = True
-        text_config.use_attention_layer_norm = False
-        text_config.use_post_attention_layernorm = True
-        text_config.use_post_mlp_layernorm = False
    elif variant == "7B-O":
        pooling_config.text_intermediate_size = 22016
        pooling_config.text_hidden_size = 4096
-        text_config.qkv_bias = original_config["qkv_bias"]
-        text_config.use_attention_layer_norm = original_config["attention_layer_norm"]
-        text_config.use_post_attention_layernorm = False
-        text_config.use_post_mlp_layernorm = True
-    elif variant == "7B-D":
-        text_config.qkv_bias = True
-        text_config.use_attention_layer_norm = False
-        text_config.use_post_attention_layernorm = True
-        text_config.use_post_mlp_layernorm = False
-
-    text_config.o_proj_bias = False

+    text_config.attention_bias = original_config["qkv_bias"]
+    text_config.use_postnorm = original_config["norm_after"]
+    text_config.use_attention_layer_norm = original_config["attention_layer_norm"]
+
    config = MolmoConfig(
        text_config=text_config.to_dict(),
@@ -221,9 +210,6 @@ def write_model(
    # Some post-processing of specific params.
    for old_key, new_key in new_keys.items():
        new_key = new_key.removeprefix("model.")
-        # remap keys
-        if "post_attention_layernorm" in new_key and variant == "7B-O":
-            new_key = new_key.replace("post_attention_layernorm", "post_mlp_layernorm")
        state_dict[new_key] = state_dict.pop(old_key)
    # Post-process the current_parameter.

@@ -293,9 +279,9 @@ def write_model(
    # ------------------------------------------------------------
    extra_special_tokens = {
        "image_token": "<image>",
-        "boi_token": "<im_patch>",
-        "eoi_token": "<im_start>",
-        "im_patch_token": "<im_end>",
+        "boi_token": "<im_start>",
+        "eoi_token": "<im_end>",
+        "im_patch_token": "<im_patch>",
        "im_col_token": "<im_col>",
    }
    if variant in ["7B-D", "72B"]:
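
For context on the hunks above: the conversion script maps original checkpoint keys to Hugging Face keys via regex substitution and then strips a leading `model.` prefix before moving tensors into the new state dict. A self-contained sketch of that pattern, using a two-entry excerpt of the mapping; the helper name `convert_key` is made up for illustration.

```python
import re

# Excerpt of the regex mapping used by the conversion script (original checkpoint key -> HF key).
ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
    r"transformer.blocks.(\d+).attn_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight",
    r"transformer.blocks.(\d+).ff_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight",
}


def convert_key(old_key: str) -> str:
    """Return the converted key, or the original key if no pattern matches (illustrative helper)."""
    for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items():
        new_key, n_subs = re.subn(pattern, replacement, old_key)
        if n_subs:
            # Mirrors the script's removeprefix("model.") step (a no-op for these two keys).
            return new_key.removeprefix("model.")
    return old_key


print(convert_key("transformer.blocks.3.ff_norm.weight"))
# -> language_model.model.layers.3.post_attention_layernorm.weight
```
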
106 changes: 91 additions & 15 deletions src/transformers/models/molmo/modeling_molmo.py
@@ -331,10 +331,10 @@ def __init__(self, config: MolmoTextConfig, layer_idx: Optional[int] = None):
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
-        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.qkv_bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.qkv_bias)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.qkv_bias)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.o_proj_bias)
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.q_norm = ConditionalMolmoRMSNorm(
            hidden_size=self.hidden_size,
@@ -646,7 +646,7 @@ def forward(
        }


-class MolmoDecoderLayer(nn.Module):
+class MolmoPrenormDecoderLayer(nn.Module):
    def __init__(self, config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
@@ -659,12 +659,7 @@ def __init__(self, config, layer_idx: int):
        self.self_attn = MOLMO_TEXT_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
        self.mlp = MolmoMLP(config)
        self.input_layernorm = MolmoTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = ConditionalMolmoRMSNorm(
-            config.hidden_size, use_layer_norm=config.use_post_attention_layernorm, eps=config.rms_norm_eps
-        )
-        self.post_mlp_layernorm = ConditionalMolmoRMSNorm(
-            config.hidden_size, use_layer_norm=config.use_post_mlp_layernorm, eps=config.rms_norm_eps
-        )
+        self.post_attention_layernorm = MolmoTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
@@ -701,7 +696,6 @@ def forward(
        """

        residual = hidden_states
-
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
@@ -721,7 +715,88 @@ def forward(
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
-        hidden_states = self.post_mlp_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+class MolmoDecoderLayer(nn.Module):
+    def __init__(self, config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        if config.sliding_window and config._attn_implementation != "flash_attention_2":
+            logger.warning_once(
+                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
+                "unexpected results may be encountered."
+            )
+        self.self_attn = MOLMO_TEXT_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.mlp = MolmoMLP(config)
+        self.input_layernorm = MolmoTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MolmoTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
@@ -807,7 +882,7 @@ class MolmoTextPreTrainedModel(PreTrainedModel):
    config_class = MolmoTextConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
-    _no_split_modules = ["MolmoTextDecoderLayer"]
+    _no_split_modules = ["MolmoDecoderLayer", "MolmoPrenormDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
@@ -923,8 +998,9 @@ def __init__(self, config):
            config.hidden_size,
        )

+        decoder_layer = MolmoDecoderLayer if self.config.use_postnorm else MolmoPrenormDecoderLayer
        self.layers = nn.ModuleList(
-            [MolmoDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [decoder_layer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = MolmoTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
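
To make the `use_postnorm` switch concrete: `MolmoPrenormDecoderLayer` normalizes the input of each sub-layer and adds the raw residual, while the post-norm `MolmoDecoderLayer` added above runs the sub-layer first and normalizes its output before the residual add. A stripped-down sketch of the two orderings; a plain `nn.LayerNorm` and a single generic sub-layer stand in for Molmo's RMSNorm, attention and MLP, and the class names are illustrative, not from the PR.

```python
import torch
from torch import nn


class PreNormBlock(nn.Module):
    """x + sublayer(norm(x)) -- the ordering used by MolmoPrenormDecoderLayer."""

    def __init__(self, hidden_size: int, sublayer: nn.Module):
        super().__init__()
        self.norm = nn.LayerNorm(hidden_size)  # Molmo uses RMSNorm; LayerNorm keeps the sketch short
        self.sublayer = sublayer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.sublayer(self.norm(x))


class PostNormBlock(nn.Module):
    """x + norm(sublayer(x)) -- the ordering selected when config.use_postnorm is True."""

    def __init__(self, hidden_size: int, sublayer: nn.Module):
        super().__init__()
        self.norm = nn.LayerNorm(hidden_size)
        self.sublayer = sublayer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.norm(self.sublayer(x))


# Tiny smoke test: both orderings preserve the hidden shape.
hidden = torch.randn(1, 4, 16)
mlp = nn.Sequential(nn.Linear(16, 32), nn.GELU(), nn.Linear(32, 16))
print(PreNormBlock(16, mlp)(hidden).shape, PostNormBlock(16, mlp)(hidden).shape)
```
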