Fix Llama 3.2 & 3.1 on LNL (#12196)
Oscilloscope98 authored Oct 14, 2024
1 parent: 516b578 · commit: f8d1adc
Showing 2 changed files with 4 additions and 1 deletion.
python/llm/src/ipex_llm/transformers/convert.py: 2 additions & 1 deletion

@@ -1268,7 +1268,7 @@ def _optimize_post(model, lightweight_bmm=False):
         from ipex_llm.transformers.models.llama import llama_mlp_forward
 
     if model.config.model_type == "llama" and model.config.rope_scaling is not None:
-        # llama 3.2
+        # llama 3.2 & llama 3.1
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
         from ipex_llm.transformers.models.common import rms_norm_forward
@@ -1279,6 +1279,7 @@ def _optimize_post(model, lightweight_bmm=False):
         convert_forward(model, module.LlamaMLP, mlp_silu_forward)
         convert_forward(model, module.LlamaModel, llama_model_forward)
         convert_forward(model, module.LlamaAttention, llama_attention_forward)
+        convert_forward(model, module.LlamaSdpaAttention, llama_attention_forward)
     elif model.config.model_type == "mllama":
         # llama 3.2 vision
         modeling_module_name = model.__class__.__module__
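
Why the extra convert_forward call matters: transformers versions that define LlamaSdpaAttention dispatch to it whenever attn_implementation is "sdpa", which is typically the default, so a freshly loaded model instantiates LlamaSdpaAttention rather than LlamaAttention. Registering llama_attention_forward for both classes ensures the optimized forward is applied no matter which backend transformers selected. The sketch below illustrates the generic forward-replacement pattern such a helper relies on; the name patch_forward and its body are illustrative assumptions, not the actual convert_forward implementation in ipex_llm.

# Illustrative sketch only: rebind the forward method of every submodule that
# matches a target class, the same monkey-patching idea convert_forward relies on.
import types
import torch.nn as nn

def patch_forward(model: nn.Module, target_cls: type, new_forward) -> None:
    for module in model.modules():
        if isinstance(module, target_cls):
            # Bind new_forward to this instance so calling module(x) runs it.
            module.forward = types.MethodType(new_forward, module)

# Hypothetical usage mirroring the diff above: the same optimized forward is
# registered for both the eager and the SDPA attention classes.
# patch_forward(model, module.LlamaAttention, llama_attention_forward)
# patch_forward(model, module.LlamaSdpaAttention, llama_attention_forward)
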
python/llm/src/ipex_llm/transformers/models/llama32.py: 2 additions & 0 deletions

@@ -204,6 +204,8 @@ def llama_attention_forward(
     kv_seq_len = key_states.size(2)
     if attention_mask is not None:  # no matter the length, we just slice it
         causal_mask = attention_mask[:, :, :, :kv_seq_len]
+    else:
+        causal_mask = None
 
     attn_weights = None
     if use_sdp(q_len, kv_seq_len, self.head_dim, query_states):
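
Why the else branch is needed: inside llama_attention_forward, causal_mask was previously assigned only when attention_mask was not None. When the caller passes no mask (for example, plain single-sequence generation without padding), the later reference to causal_mask in the attention computation raises UnboundLocalError. Defaulting it to None keeps the scaled-dot-product path valid, since a None mask simply means no masking. The snippet below is a self-contained sketch of that pattern, not the actual ipex_llm forward; the attend helper and the tensor shapes are assumptions for illustration.

# Self-contained sketch of the masking pattern fixed above (illustrative only).
import torch
import torch.nn.functional as F

def attend(query, key, value, attention_mask):
    kv_seq_len = key.size(2)
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, :kv_seq_len]
    else:
        # Without this branch, attention_mask=None would leave causal_mask
        # undefined and the call below would raise UnboundLocalError.
        causal_mask = None
    # F.scaled_dot_product_attention accepts attn_mask=None (no masking applied).
    return F.scaled_dot_product_attention(query, key, value, attn_mask=causal_mask)

q = k = v = torch.randn(1, 8, 16, 32)   # (batch, num_heads, seq_len, head_dim)
out = attend(q, k, v, attention_mask=None)  # returns output instead of erroring
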
