Support PP for qwen1.5 #11300

Merged 2 commits on Jun 13, 2024
44 changes: 34 additions & 10 deletions python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
@@ -6,9 +6,10 @@ This example demonstrates how to run IPEX-LLM optimized low-bit model vertically
To run this example with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine.

## Verified Models
- [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
- [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
- [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- [meta-llama/Llama-2-7b-chat-hf](./run_llama_arc_2_card.sh)
- [meta-llama/Llama-2-13b-chat-hf](./run_llama_arc_2_card.sh)
- [meta-llama/Meta-Llama-3-8B-Instruct](./run_llama_arc_2_card.sh)
- [Qwen/Qwen1.5-14B-Chat](./run_qwen1.5_14b_arc_2_card.sh)

## Example: Run pipeline parallel inference on multiple GPUs

@@ -28,18 +29,41 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.

### 2. Run pipeline parallel inference on multiple GPUs

For optimal performance, it is recommended to set several environment variables. We provide example usage as following:
For optimal performance, it is recommended to set several environment variables. We provide example usage as follows:

- Run Llama-2-13b-chat-hf on two Intel Arc A770
</details>

<details>
<summary> Show Llama2 and Llama3 example </summary>

#### Run Llama-2-7b-chat-hf / Llama-2-13b-chat-hf / Meta-Llama-3-8B-Instruct on two Intel Arc A770

In the test script, you could set `--repo-id-or-model-path` to the Hugging Face repo id of the Llama2 / Llama3 model to be downloaded, or to the path of a local Hugging Face checkpoint folder. You could also change `NUM_GPUS` to the number of GPUs available on your machine.

```bash
bash run_llama_arc_2_card.sh
```
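
For instance, a minimal sketch of the relevant lines inside the script when loading the model from a local checkpoint folder (the path below is a placeholder, not an actual location):

```bash
# Sketch only: replace the placeholder path with your local Llama2 / Llama3 checkpoint folder
NUM_GPUS=2  # adjust to the number of GPUs available on your machine
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path '/path/to/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
```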

</details>

</details>

<details>
<summary> Show Qwen1.5-14B example </summary>

#### Run Qwen1.5-14B-Chat on two Intel Arc A770

In the test script, you could set `--repo-id-or-model-path` to the Hugging Face repo id of the Qwen1.5 model to be downloaded, or to the path of a local Hugging Face checkpoint folder. You could also change `NUM_GPUS` to the number of GPUs available on your machine.

```bash
bash run_llama2_13b_arc_2_card.sh
pip install transformers==4.37.0
bash run_qwen1.5_14b_arc_2_card.sh
```
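
As with the Llama scripts, the repo id can be replaced with the path to a local checkpoint folder; a minimal sketch with a placeholder path:

```bash
# Sketch only: replace the placeholder path with your local Qwen1.5-14B-Chat checkpoint folder
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path '/path/to/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
```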

> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
</details>

#### Sample Output
##### [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
### 3. Sample Output
#### [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
```log
Inference time: xxxx s
First token cost xxxx s and rest tokens cost average xxxx s
@@ -49,4 +73,4 @@ Once upon a time, there existed a little girl who liked to have adventures. She
Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. She was always asking her parents to take her on trips, but they were always too busy or too tired.

One day, the little girl
```
```
@@ -0,0 +1,40 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

source /opt/intel/oneapi/setvars.sh
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=9090
export FI_PROVIDER=tcp
export USE_XETLA=OFF
export OMP_NUM_THREADS=6
KERNEL_VERSION=${KERNEL_VERSION:-$(uname -r)}  # assumed fallback: use the running kernel version if KERNEL_VERSION is not already set
if [[ $KERNEL_VERSION != *"6.5"* ]]; then
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
fi
export TORCH_LLM_ALLREDUCE=0

NUM_GPUS=2 # number of GPUs to use

# To run Llama-2-7b-chat-hf
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS

# # To run Llama-2-13b-chat-hf
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS

# # To run Meta-Llama-3-8B-Instruct
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
@@ -27,4 +27,4 @@ export TORCH_LLM_ALLREDUCE=0

NUM_GPUS=2 # number of GPUs to use
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
3 changes: 2 additions & 1 deletion python/llm/src/ipex_llm/transformers/models/qwen2.py
@@ -73,9 +73,10 @@ def qwen2_model_forward(
return_dict: Optional[bool] = None,
):
use_cache = use_cache if use_cache is not None else self.config.use_cache
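# With pipeline parallelism, stages after the first receive inputs_embeds while input_ids is None,
# so fall back to whichever is present for the quantize-KV-cache check below.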
input = input_ids if input_ids is not None else inputs_embeds
use_quantize_kv = (
self.config.hidden_size != 3584 # disable quantize kv in specific model
and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids)
and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input)
)
if use_cache:
if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):