From ddfb3cd430d47d3bf3184926b341f9a904f4b78e Mon Sep 17 00:00:00 2001
From: plusbang
Date: Fri, 13 Dec 2024 11:12:05 +0800
Subject: [PATCH 1/2] add limitation

---
 python/llm/src/ipex_llm/transformers/npu_model.py         | 8 +++++---
 .../src/ipex_llm/transformers/npu_models/convert_mp.py    | 9 ++++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 673a56ecfaa..b952b1aef8a 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -290,7 +290,8 @@ def optimize_npu_model(cls, *args, **kwargs):
         model.config.update({"group_size": quantization_group_size})
         model.config.update({"asym": qtype == "asym_int4_rtn"})
         optimize_llm_pre(model, qtype, mixed_precision,
-                         quantization_group_size=quantization_group_size)
+                         quantization_group_size=quantization_group_size,
+                         max_prompt_len=max_prompt_len)
         cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
                          quantization_group_size, imatrix_data,
                          *args, **kwargs)
@@ -580,7 +581,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         with torch.no_grad():
             optimize_llm_pre(model, qtype, mixed_precision,
                              quantization_group_size=quantization_group_size,
-                             load=bigdl_lcmu_enabled)
+                             load=bigdl_lcmu_enabled, max_prompt_len=max_prompt_len)
             cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
                              quantization_group_size, *model_args, **kwargs)
             create_npu_kernels(llm)
@@ -804,7 +805,8 @@ def optimize_npu_model(cls, *args, **kwargs):
 
         with torch.no_grad():
             optimize_llm_pre(model, qtype, mixed_precision,
-                             quantization_group_size=quantization_group_size)
+                             quantization_group_size=quantization_group_size,
+                             max_prompt_len=max_prompt_len)
             cls.load_convert_fp16(qtype, model.encoder, "cpu",
                                   modules_to_not_convert, quantization_group_size)
             create_npu_kernels(model.encoder)
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index 64d6f30b160..4fe29a643ab 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -31,7 +31,7 @@ def convert_forward(m, target_m, new_forward):
 
 
 def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
-                     quantization_group_size=0, load=False):
+                     quantization_group_size=0, load=False, max_prompt_len=512):
     if model.config.model_type == "baichuan":
         # process NormHead module in Baichuan2 7B
         if hasattr(model, 'lm_head') and model.lm_head is not None:
@@ -48,6 +48,13 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
 
     cpu_lm_head = os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0"
 
+    # workaround for long input performance of llama3.2-3b and glm-edge-4b CW
+    if os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT") is None:
+        disable_compile_opt = model.config.model_type == "llama" and \
+            model.config.hidden_size == 3072 and max_prompt_len >= 1920 \
+            and quantization_group_size == 0
+        os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1" if disable_compile_opt else "0"
+
     # workaround for MiniCPM-2B
     if model.config.model_type == "minicpm" and model.config.num_hidden_layers == 40:
         # 73440 is vocab_size of MiniCPM-1B

From 7bf16007ea545213603f00e98e30c1af066536cd Mon Sep 17 00:00:00 2001
From: plusbang
Date: Fri, 13 Dec 2024 12:00:08 +0800
Subject: [PATCH 2/2] fix code style

---
 python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index 4fe29a643ab..12c53fc3701 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -51,8 +51,8 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
     # workaround for long input performance of llama3.2-3b and glm-edge-4b CW
     if os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT") is None:
         disable_compile_opt = model.config.model_type == "llama" and \
-            model.config.hidden_size == 3072 and max_prompt_len >= 1920 \
-            and quantization_group_size == 0
+            model.config.hidden_size == 3072 and max_prompt_len >= 1920 and \
+            quantization_group_size == 0
         os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1" if disable_compile_opt else "0"
 
     # workaround for MiniCPM-2B
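For context, the sketch below isolates the gating logic these patches add to optimize_llm_pre: the NPU compile optimization is disabled only when the model looks like a channel-wise-quantized (group_size == 0) Llama with hidden_size 3072 and the prompt budget is at least 1920 tokens, and only if the user has not already set IPEX_LLM_NPU_DISABLE_COMPILE_OPT. The helper name should_disable_compile_opt and the SimpleNamespace stand-in for the model config are illustrative, not part of the patch; the environment variable name and the thresholds are taken from the diff.

import os
from types import SimpleNamespace


def should_disable_compile_opt(model_config, max_prompt_len, quantization_group_size):
    # Mirrors the condition introduced in optimize_llm_pre: Llama-3.2-3B-like
    # config (hidden_size 3072), long prompts (>= 1920 tokens), channel-wise
    # quantization (group_size == 0).
    return (model_config.model_type == "llama"
            and model_config.hidden_size == 3072
            and max_prompt_len >= 1920
            and quantization_group_size == 0)


# Example: a Llama-3.2-3B-like config with a 2048-token prompt budget and
# channel-wise quantization triggers the workaround; an explicitly exported
# IPEX_LLM_NPU_DISABLE_COMPILE_OPT would take precedence and skip this branch.
config = SimpleNamespace(model_type="llama", hidden_size=3072)
if os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT") is None:
    disable = should_disable_compile_opt(config, max_prompt_len=2048,
                                         quantization_group_size=0)
    os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1" if disable else "0"

print(os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"])  # "1" for this config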