diff --git a/docs/mddocs/Quickstart/npu_quickstart.md b/docs/mddocs/Quickstart/npu_quickstart.md
index a9150a01a11..0cb15750377 100644
--- a/docs/mddocs/Quickstart/npu_quickstart.md
+++ b/docs/mddocs/Quickstart/npu_quickstart.md
@@ -90,6 +90,9 @@ For `ipex-llm` NPU support, set the following environment variable with active `
 
 ```cmd
 set BIGDL_USE_NPU=1
+
+:: [optional] for MTL support
+set IPEX_LLM_NPU_MTL=1
 ```
 
 ## Python API
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index b1ffab77d93..e8ed3db7031 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -60,6 +60,9 @@ For optimal performance, it is recommended to set several environment variables.
 
 ```cmd
 set BIGDL_USE_NPU=1
+
+:: [optional] for running models on MTL
+set IPEX_LLM_NPU_MTL=1
 ```
 
 ## 3. Run Models
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index 12c53fc3701..2c145c536b7 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -32,6 +32,11 @@ def convert_forward(m, target_m, new_forward):
 
 def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
                      quantization_group_size=0, load=False, max_prompt_len=512):
+    if os.environ.get("IPEX_LLM_NPU_MTL", "0") == "1":
+        # For MTL support
+        os.environ["IPEX_LLM_NPU_USE_LEVEL0"] = "0"
+        os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1"
+
     if model.config.model_type == "baichuan":
         # process NormHead module in Baichuan2 7B
         if hasattr(model, 'lm_head') and model.lm_head is not None:
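
For context, the new switch is consumed purely through environment variables: when `IPEX_LLM_NPU_MTL=1` is set, `optimize_llm_pre` turns off the level-zero path and the compile optimizations for MTL machines. The sketch below shows how a user could set the same variables from Python before loading a model; it is illustrative only, and the trailing "load the model" step is a placeholder, not part of this change.

```python
import os

# Variables documented in the hunks above:
#   BIGDL_USE_NPU=1       -> enable ipex-llm NPU support
#   IPEX_LLM_NPU_MTL=1    -> MTL-specific path; optimize_llm_pre() then sets
#                            IPEX_LLM_NPU_USE_LEVEL0=0 and
#                            IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1
os.environ["BIGDL_USE_NPU"] = "1"
os.environ["IPEX_LLM_NPU_MTL"] = "1"  # only needed when running on MTL

# ...then load and run the model as in the existing NPU examples...
```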