diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index b7899d73059..0a58449945c 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -1268,7 +1268,7 @@ def _optimize_post(model, lightweight_bmm=False):
         from ipex_llm.transformers.models.llama import llama_mlp_forward
 
     if model.config.model_type == "llama" and model.config.rope_scaling is not None:
-        # llama 3.2
+        # llama 3.2 & llama 3.1
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
         from ipex_llm.transformers.models.common import rms_norm_forward
@@ -1279,6 +1279,7 @@
         convert_forward(model, module.LlamaMLP, mlp_silu_forward)
         convert_forward(model, module.LlamaModel, llama_model_forward)
         convert_forward(model, module.LlamaAttention, llama_attention_forward)
+        convert_forward(model, module.LlamaSdpaAttention, llama_attention_forward)
     elif model.config.model_type == "mllama":
         # llama 3.2 vision
         modeling_module_name = model.__class__.__module__
diff --git a/python/llm/src/ipex_llm/transformers/models/llama32.py b/python/llm/src/ipex_llm/transformers/models/llama32.py
index 9cb0f2c30a6..9bb1d97e266 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama32.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama32.py
@@ -204,6 +204,8 @@ def llama_attention_forward(
     kv_seq_len = key_states.size(2)
     if attention_mask is not None:  # no matter the length, we just slice it
         causal_mask = attention_mask[:, :, :, :kv_seq_len]
+    else:
+        causal_mask = None
 
     attn_weights = None
     if use_sdp(q_len, kv_seq_len, self.head_dim, query_states):
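
Note (illustrative, not part of the patch): the new `else: causal_mask = None` branch matters because the attention paths below this hunk pass `causal_mask` on even when `attention_mask` is None; without a default, that reference would raise UnboundLocalError. A minimal standalone sketch of the intended control flow, using a hypothetical helper name `sliced_causal_mask` purely for illustration:

    import torch

    def sliced_causal_mask(attention_mask, kv_seq_len):
        # Mirror the patched control flow: slice the mask when one is given,
        # otherwise fall back to None so downstream code always sees a defined value.
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, :kv_seq_len]
        else:
            causal_mask = None
        return causal_mask

    full = torch.zeros(1, 1, 8, 32)   # [batch, heads, q_len, kv_len]
    assert sliced_causal_mask(full, 16).shape == (1, 1, 8, 16)
    assert sliced_causal_mask(None, 16) is None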