From 5180cba92138fdb91d6c3010de2349d91d314862 Mon Sep 17 00:00:00 2001
From: plusbang
Date: Thu, 13 Jun 2024 22:36:28 +0800
Subject: [PATCH 1/2] update

---
 .../GPU/Pipeline-Parallel-Inference/README.md | 44 ++++++++++++++-----
 .../run_llama_arc_2_card.sh                   | 40 +++++++++++++++++
 ..._card.sh => run_qwen1.5_14b_arc_2_card.sh} |  2 +-
 .../src/ipex_llm/transformers/models/qwen2.py |  3 +-
 4 files changed, 77 insertions(+), 12 deletions(-)
 create mode 100644 python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh
 rename python/llm/example/GPU/Pipeline-Parallel-Inference/{run_llama2_13b_arc_2_card.sh => run_qwen1.5_14b_arc_2_card.sh} (91%)

diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
index c1ffdd96b1e..3fac4d19d77 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
@@ -6,9 +6,10 @@ This example demonstrates how to run IPEX-LLM optimized low-bit model vertically
 To run this example with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine.
 
 ## Verified Models
-- [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
-- [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
-- [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
+- [meta-llama/Llama-2-7b-chat-hf](./run_llama_arc_2_card.sh)
+- [meta-llama/Llama-2-13b-chat-hf](./run_llama_arc_2_card.sh)
+- [meta-llama/Meta-Llama-3-8B-Instruct](./run_llama_arc_2_card.sh)
+- [Qwen/Qwen1.5-14B-Chat](./run_qwen1.5_14b_arc_2_card.sh)
 
 ## Example: Run pipeline parallel inference on multiple GPUs
 
@@ -28,18 +29,41 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 
 ### 2. Run pipeline parallel inference on multiple GPUs
 
-For optimal performance, it is recommended to set several environment variables. We provide example usage as following:
+For optimal performance, it is recommended to set several environment variables. We provide example usages as follows:
 
-- Run Llama-2-13b-chat-hf on two Intel Arc A770
+
+<details>
+  <summary> Show Llama2 and Llama3 example</summary>
+
+#### Run Llama-2-7b-chat-hf / Llama-2-13b-chat-hf / Meta-Llama-3-8B-Instruct on two Intel Arc A770
+
+You could specify `--repo-id-or-model-path` in the test script to be the Hugging Face repo id of the Llama2 / Llama3 model to be downloaded, or the path to a local checkpoint folder. You could also change `NUM_GPUS` to the number of GPUs you have on your machine.
+
+```bash
+bash run_llama_arc_2_card.sh
+```
+
+</details>
+
+<details>
+  <summary> Show Qwen1.5-14B example</summary>
+
+#### Run Qwen1.5-14B-Chat on two Intel Arc A770
+
+You could specify `--repo-id-or-model-path` in the test script to be the Hugging Face repo id of the Qwen1.5 model to be downloaded, or the path to a local checkpoint folder. You could also change `NUM_GPUS` to the number of GPUs you have on your machine.
 
 ```bash
-bash run_llama2_13b_arc_2_card.sh
+pip install transformers==4.37.0
+bash run_qwen1.5_14b_arc_2_card.sh
 ```
 
-> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
+</details>
 
-#### Sample Output
-##### [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
+### 3. Sample Output
+#### [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)
 ```log
 Inference time: xxxx s
 First token cost xxxx s and rest tokens cost average xxxx s
@@ -49,4 +73,4 @@ Once upon a time, there existed a little girl who liked to have adventures. She
 
 Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. She was always asking her parents to take her on trips, but they were always too busy or too tired. One day, the little girl
 
-```
\ No newline at end of file
+```
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh
new file mode 100644
index 00000000000..7f7a467c695
--- /dev/null
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh
@@ -0,0 +1,40 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+source /opt/intel/oneapi/setvars.sh
+export MASTER_ADDR=127.0.0.1
+export MASTER_PORT=9090
+export FI_PROVIDER=tcp
+export USE_XETLA=OFF
+export OMP_NUM_THREADS=6
+if [[ $KERNEL_VERSION != *"6.5"* ]]; then
+    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+fi
+export TORCH_LLM_ALLREDUCE=0
+
+NUM_GPUS=2 # number of used GPU
+
+# To run Llama-2-7b-chat-hf
+CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
+  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+
+# # To run Llama-2-13b-chat-hf
+# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
+#   generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+
+# # To run Meta-Llama-3-8B-Instruct
+# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
+#   generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama2_13b_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_14b_arc_2_card.sh
similarity index 91%
rename from python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama2_13b_arc_2_card.sh
rename to python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_14b_arc_2_card.sh
index 5924aada001..0e450d44cde 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama2_13b_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_14b_arc_2_card.sh
@@ -27,4 +27,4 @@ export TORCH_LLM_ALLREDUCE=0
 
 NUM_GPUS=2 # number of used GPU
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
diff --git a/python/llm/src/ipex_llm/transformers/models/qwen2.py b/python/llm/src/ipex_llm/transformers/models/qwen2.py
index 2e236acb1d6..6d209aa5567 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen2.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen2.py
@@ -73,9 +73,10 @@ def qwen2_model_forward(
     return_dict: Optional[bool] = None,
 ):
     use_cache = use_cache if use_cache is not None else self.config.use_cache
+    input = input_ids if input_ids is not None else inputs_embeds
     use_quantize_kv = (
         self.config.hidden_size != 3584  # disable quantize kv in specific model
-        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids)
+        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input)
     )
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):

From b136e0cdce11748150eed09089a366da60b047f3 Mon Sep 17 00:00:00 2001
From: plusbang
Date: Thu, 13 Jun 2024 22:59:50 +0800
Subject: [PATCH 2/2] update

---
 .../example/GPU/Pipeline-Parallel-Inference/README.md    | 9 +++++----
 ...en1.5_14b_arc_2_card.sh => run_qwen1.5_arc_2_card.sh} | 8 +++++++-
 2 files changed, 12 insertions(+), 5 deletions(-)
 rename python/llm/example/GPU/Pipeline-Parallel-Inference/{run_qwen1.5_14b_arc_2_card.sh => run_qwen1.5_arc_2_card.sh} (76%)

diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
index 3fac4d19d77..2005fe0f5d7 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
@@ -9,7 +9,8 @@ To run this example with IPEX-LLM on Intel GPUs, we have some recommended requir
 - [meta-llama/Llama-2-7b-chat-hf](./run_llama_arc_2_card.sh)
 - [meta-llama/Llama-2-13b-chat-hf](./run_llama_arc_2_card.sh)
 - [meta-llama/Meta-Llama-3-8B-Instruct](./run_llama_arc_2_card.sh)
-- [Qwen/Qwen1.5-14B-Chat](./run_qwen1.5_14b_arc_2_card.sh)
+- [Qwen/Qwen1.5-7B-Chat](./run_qwen1.5_arc_2_card.sh)
+- [Qwen/Qwen1.5-14B-Chat](./run_qwen1.5_arc_2_card.sh)
 
 ## Example: Run pipeline parallel inference on multiple GPUs
 
@@ -49,15 +50,15 @@ bash run_llama_arc_2_card.sh
 </details>
 
 <details>
-  <summary> Show Qwen1.5-14B example</summary>
+  <summary> Show Qwen1.5 example</summary>
 
-#### Run Qwen1.5-14B-Chat on two Intel Arc A770
+#### Run Qwen1.5-7B-Chat / Qwen1.5-14B-Chat on two Intel Arc A770
 
 You could specify `--repo-id-or-model-path` in the test script to be the Hugging Face repo id of the Qwen1.5 model to be downloaded, or the path to a local checkpoint folder. You could also change `NUM_GPUS` to the number of GPUs you have on your machine.
 
 ```bash
 pip install transformers==4.37.0
-bash run_qwen1.5_14b_arc_2_card.sh
+bash run_qwen1.5_arc_2_card.sh
 ```
 
 </details>
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_14b_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh
similarity index 76%
rename from python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_14b_arc_2_card.sh
rename to python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh
index 0e450d44cde..f3b49bbffc1 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_14b_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh
@@ -26,5 +26,11 @@ fi
 export TORCH_LLM_ALLREDUCE=0
 
 NUM_GPUS=2 # number of used GPU
+
+# To run Qwen1.5-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+
+# # To run Qwen1.5-14B-Chat
+# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS