
Commit 222e966: fix

JinBridger committed Jun 12, 2024
1 parent 7f42e6b commit 222e966
Showing 6 changed files with 42 additions and 9 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -154,6 +154,7 @@ Over 50 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM
| ChatGLM | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm) | |
| ChatGLM2 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2) |
| ChatGLM3 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3) |
+| GLM-4 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/glm4) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4) |
| Mistral | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral) |
| Mixtral | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral) |
| Falcon | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon) |
7 changes: 7 additions & 0 deletions docs/readthedocs/source/index.rst
@@ -257,6 +257,13 @@ Verified Models
<td>
<a href="https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3">link</a></td>
</tr>
+<tr>
+<td>GLM-4</td>
+<td>
+<a href="https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/glm4">link</a></td>
+<td>
+<a href="https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HF-Transformers-AutoModels/Model/glm4">link</a></td>
+</tr>
<tr>
<td>Mistral</td>
<td>
14 changes: 12 additions & 2 deletions python/llm/example/CPU/HF-Transformers-AutoModels/Model/glm4/README.md
@@ -19,7 +19,8 @@ conda activate llm
# install the latest ipex-llm nightly build with 'all' option
pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu

-pip install tiktoken # additional package required for GLM-4 to conduct generation
+# install tiktoken required for GLM-4
+pip install tiktoken
```

On Windows:
@@ -29,6 +30,9 @@ conda create -n llm python=3.11
conda activate llm
pip install --pre --upgrade ipex-llm[all]
+# install tiktoken required for GLM-4
+pip install tiktoken
```

### 2. Run
@@ -65,7 +69,7 @@ numactl -C 0-47 -m 0 python ./generate.py
```

#### 2.3 Sample Output
-#### [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)
+##### [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)
```log
Inference time: xxxx s
-------------------- Prompt --------------------
@@ -105,6 +109,9 @@ conda activate llm

# install the latest ipex-llm nightly build with 'all' option
pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu

+# install tiktoken required for GLM-4
+pip install tiktoken
```

On Windows:
@@ -114,6 +121,9 @@ conda create -n llm python=3.11
conda activate llm
pip install --pre --upgrade ipex-llm[all]
+# install tiktoken required for GLM-4
+pip install tiktoken
```

### 2. Run
8 changes: 3 additions & 5 deletions python/llm/example/CPU/HF-Transformers-AutoModels/Model/glm4/generate.py
@@ -43,7 +43,9 @@
# which convert the relevant layers in the model into INT4 format
 model = AutoModel.from_pretrained(model_path,
                                   load_in_4bit=True,
-                                  trust_remote_code=True)
+                                  optimize_model=True,
+                                  trust_remote_code=True,
+                                  use_cache=True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -54,10 +56,6 @@
prompt = GLM4_PROMPT_FORMAT.format(prompt=args.prompt)
input_ids = tokenizer.encode(prompt, return_tensors="pt")
st = time.time()
-# if your selected model is capable of utilizing previous key/value attentions
-# to enhance decoding speed, but has `"use_cache": false` in its model config,
-# it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()
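For context, a minimal self-contained sketch of how the updated loading call fits into this example's full generation flow. The prompt template and model path here are illustrative assumptions; the real script defines `GLM4_PROMPT_FORMAT` itself and reads its arguments from argparse:

```python
import time

from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer

# Hypothetical stand-in for the GLM4_PROMPT_FORMAT defined in the real script.
GLM4_PROMPT_FORMAT = "<|user|>\n{prompt}\n<|assistant|>\n"

model_path = "THUDM/glm-4-9b-chat"  # placeholder; the script takes this from argparse

# use_cache=True at load time bakes KV-cache reuse into the model config, which is
# why the old generate()-time comment about setting use_cache could be removed.
model = AutoModel.from_pretrained(model_path,
                                  load_in_4bit=True,
                                  optimize_model=True,
                                  trust_remote_code=True,
                                  use_cache=True)

# GLM-4's remote-code tokenizer needs tiktoken, per the install step above.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = GLM4_PROMPT_FORMAT.format(prompt="What is AI?")
input_ids = tokenizer.encode(prompt, return_tensors="pt")
st = time.time()
output = model.generate(input_ids, max_new_tokens=32)
print(f"Inference time: {time.time() - st:.2f} s")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Moving `use_cache=True` into `from_pretrained` makes KV-cache reuse part of the loaded configuration, so `generate()` needs no extra flag to get optimal INT4 decoding speed.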
15 changes: 14 additions & 1 deletion python/llm/example/CPU/PyTorch-Models/Model/glm4/README.md
@@ -20,7 +20,8 @@ conda activate llm
# install the latest ipex-llm nightly build with 'all' option
pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu

-pip install tiktoken # additional package required for GLM-4 to conduct generation
+# install tiktoken required for GLM-4
+pip install tiktoken
```

On Windows:
@@ -30,6 +31,9 @@ conda create -n llm python=3.11
conda activate llm
pip install --pre --upgrade ipex-llm[all]
+# install tiktoken required for GLM-4
+pip install tiktoken
```

### 2. Run
@@ -73,3 +77,12 @@ AI是什么?
AI,即人工智能(Artificial Intelligence),是指由人创造出来的,能够模拟、延伸和扩展人的智能的计算机系统或机器。人工智能技术
```

+```
+Inference time: xxxx s
+-------------------- Output --------------------
+What is AI?
+Artificial Intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions. The term "art
+```
6 changes: 5 additions & 1 deletion python/llm/example/CPU/PyTorch-Models/Model/glm4/generate.py
@@ -39,7 +39,11 @@
model_path = args.repo_id_or_model_path

# Load model
-model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(model_path,
+                                  trust_remote_code=True,
+                                  torch_dtype='auto',
+                                  low_cpu_mem_usage=True,
+                                  use_cache=True)

# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
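For context, a minimal self-contained sketch of the PyTorch-Models variant, where the model is loaded with stock `transformers` and then optimized in a single `optimize_model()` call. The model path and prompt are illustrative assumptions; the real script reads them from argparse:

```python
import time

from ipex_llm import optimize_model
from transformers import AutoModel, AutoTokenizer

model_path = "THUDM/glm-4-9b-chat"  # placeholder; the script takes this from argparse

# Plain Hugging Face load; torch_dtype='auto' and low_cpu_mem_usage=True keep the
# initial load lean, and use_cache=True enables KV-cache reuse during decoding.
model = AutoModel.from_pretrained(model_path,
                                  trust_remote_code=True,
                                  torch_dtype='auto',
                                  low_cpu_mem_usage=True,
                                  use_cache=True)

# One line converts the relevant layers to low-bit format with IPEX-LLM.
model = optimize_model(model)

# GLM-4's remote-code tokenizer needs tiktoken, per the install step above.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
st = time.time()
output = model.generate(input_ids, max_new_tokens=32)
print(f"Inference time: {time.time() - st:.2f} s")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Unlike the HF-Transformers-AutoModels variant above, quantization here happens in a separate `optimize_model()` step after a plain Hugging Face load, rather than inside `from_pretrained`.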
