intel-analytics · lalalapotter · Jul 11, 2024 · Jul 10, 2024 · Jul 11, 2024 · qiyuangong
diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
@@ -327,9 +327,11 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
             optimize_lm_head = False
             if is_lm_head(name, model_config, out_features):
                 model_type = getattr(model_config, "model_type", None)
-                if model_type in ["gptj", "llama", "qwen2"] and \
-                        os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) == "1":
-                    optimize_lm_head = True
+                if model_type in ["gptj", "llama", "qwen2"]:
+                    if os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) is not None:
+                        optimize_lm_head = os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) == "1"
+                    elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
+                        optimize_lm_head = os.environ.get("IPEX_LLM_LOW_MEM", None) == "1"
 if shape[1] > 10: 
     shape[1] = 1 
     x = x[:, -1, :].view(shape) 
 return x 
 if shape[1] > 10: 
     shape[1] = 1 
     x = x[:, -1, :].view(shape) 
 return x 
             with init_empty_weights():
                 new_linear = None
                 is_gptq = is_gptq_linear(module)

diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -286,6 +286,8 @@ def should_split_qkv_tensor(query_states, bsz, num_heads, q_len, kv_seq_len, out
     if not output_attentions:
         if os.environ.get("IPEX_LLM_SPLIT_QKV", None) is not None:
             return os.environ.get("IPEX_LLM_SPLIT_QKV", None) == "1"
+        elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
+            return os.environ.get("IPEX_LLM_LOW_MEM", None) == "1"
         elif query_states.dtype == torch.float16 and \
                 query_states.shape[2] >= 6800:
             # split tensor for memory block limitation

diff --git a/python/llm/src/ipex_llm/transformers/models/mistral.py b/python/llm/src/ipex_llm/transformers/models/mistral.py
@@ -92,6 +92,8 @@ def should_split_qkv_tensor(query_states, bsz, num_heads, q_len, kv_seq_len, out
     if not output_attentions:
         if os.environ.get("IPEX_LLM_SPLIT_QKV", None) is not None:
             return os.environ.get("IPEX_LLM_SPLIT_QKV", None) == "1"
+        elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
+            return os.environ.get("IPEX_LLM_LOW_MEM", None) == "1"
         elif query_states.dtype == torch.float16 and \
                 query_states.shape[2] >= 6300:
             # split tensor for memory block limitation