diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/generate.py
index 4a8314d09d2..ebef3dae4c2 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/generate.py
@@ -41,6 +41,8 @@
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
+    # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
+    # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = AutoModel.from_pretrained(model_path,
                                       load_in_4bit=True,
                                       optimize_model=True,
@@ -62,10 +64,7 @@
                                 max_new_tokens=args.n_predict)
 
         st = time.time()
-        # if your selected model is capable of utilizing previous key/value attentions
-        # to enhance decoding speed, but has `"use_cache": false` in its model config,
-        # it is important to set `use_cache=True` explicitly in the `generate` function
-        # to obtain optimal performance with IPEX-LLM INT4 optimizations
+
         output = model.generate(input_ids,
                                 max_new_tokens=args.n_predict)
 
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/streamchat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/streamchat.py
index 90c9a7da975..31be35d9639 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/streamchat.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4/streamchat.py
@@ -39,11 +39,14 @@
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
+    # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
+    # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = AutoModel.from_pretrained(model_path,
                                       trust_remote_code=True,
                                       load_in_4bit=True,
                                       optimize_model=True,
-                                      use_cache=True)
+                                      use_cache=True,
+                                      cpu_embedding=True)
     model = model.to('xpu')
 
 
@@ -64,4 +67,3 @@
     for response, history in model.stream_chat(tokenizer, args.question, history=[]):
         print(response.replace(response_, ""), end="")
         response_ = response
-
\ No newline at end of file
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/glm4/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/glm4/generate.py
index 246e52ef635..3433ba29167 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/glm4/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/glm4/generate.py
@@ -39,6 +39,8 @@
     model_path = args.repo_id_or_model_path
 
     # Load model
+    # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
+    # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = AutoModel.from_pretrained(model_path,
                                       trust_remote_code=True,
                                       torch_dtype = 'auto',
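
For reference, the three hunks above apply the same loading convention. Below is a minimal, self-contained sketch of that pattern, not taken from the patched files: the import paths (`ipex_llm.transformers.AutoModel`, `transformers.AutoTokenizer`), the checkpoint name, and the prompt are assumptions chosen to show how `cpu_embedding=True` fits into `from_pretrained` alongside the existing INT4 options.

# Minimal usage sketch (assumed imports, checkpoint name, and prompt; not part of the patch):
import torch
from ipex_llm.transformers import AutoModel   # assumed import path, as in the existing GPU examples
from transformers import AutoTokenizer        # assumed import path

model_path = "THUDM/glm-4-9b-chat"  # placeholder checkpoint; substitute your own path

# Load in 4-bit (INT4); `cpu_embedding=True` keeps the memory-intensive embedding
# layer on the CPU, which is what this patch recommends for Intel iGPU users on Windows.
model = AutoModel.from_pretrained(model_path,
                                  trust_remote_code=True,
                                  load_in_4bit=True,
                                  optimize_model=True,
                                  use_cache=True,
                                  cpu_embedding=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))

The streaming example (streamchat.py) uses the same loading call and then iterates over `model.stream_chat(tokenizer, question, history=[])`, as shown in its hunk above.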