From 8265ea7ba1c668f3041f1d8aa38ffd650f6e8cc7 Mon Sep 17 00:00:00 2001
From: Yang Wang <yang3.wang@intel.com>
Date: Thu, 26 Oct 2023 21:41:31 +0000
Subject: [PATCH 1/2] Add deepspeed autotp example readme

---
 .../example/GPU/Deepspeed-AutoTP/README.md    | 34 +++++++++++++++++++
 .../llm/example/GPU/Deepspeed-AutoTP/run.sh   |  7 ++--
 2 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 python/llm/example/GPU/Deepspeed-AutoTP/README.md

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
new file mode 100644
index 00000000000..ea47305bee5
--- /dev/null
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -0,0 +1,34 @@
+# Optimizing Deepspeed AutoTP with BigDL-LLM on Multiple Intel GPUs
+
+This example demonstrates how to optimize a Deepspeed AutoTP model using BigDL-LLM low-bit optimizations on multiple [Intel GPUs](../README.md).
+
+## 0. Requirements
+To run this example with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine.
+
+## Example:
+
+### 1. Install
+
+```bash
+conda create -n llm python=3.9
+conda activate llm
+# the command below installs intel_extension_for_pytorch==2.0.110+xpu by default
+# you can install a specific ipex/torch version if needed
+pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
+pip install oneccl_bind_pt==2.0.100 -f https://developer.intel.com/ipex-whl-stable-xpu
+pip install git+https://github.com/microsoft/DeepSpeed.git@78c518e
+pip install git+https://github.com/intel/intel-extension-for-deepspeed.git@ec33277
+pip install mpi4py
+```
+
+### 2. Configure OneAPI environment variables
+```bash
+source /opt/intel/oneapi/setvars.sh
+```
+
+### 3. Run tensor parallel inference on multiple GPUs
+You may want to change some of the parameters in the script, such as setting `NUM_GPUS` to the number of GPUs you have on your machine.
+
+```bash
+bash run.sh
+```
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run.sh
index 972e8c9d247..9c3490a11f9 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run.sh
@@ -1,12 +1,13 @@
 source bigdl-llm-init -t -g
 export MASTER_ADDR=127.0.0.1
 export CCL_ZE_IPC_EXCHANGE=sockets
+NUM_GPUS=4  # number of torchrun worker processes; set this to the GPU count of your machine
 if [[ -n $OMP_NUM_THREADS ]]; then
-    export OMP_NUM_THREADS=$(($OMP_NUM_THREADS / 4))
+    export OMP_NUM_THREADS=$(( OMP_NUM_THREADS >= NUM_GPUS ? OMP_NUM_THREADS / NUM_GPUS : 1 ))  # split threads across workers, never below 1
 else
-    export OMP_NUM_THREADS=$(($(nproc) / 4))
+    export OMP_NUM_THREADS=$(( $(nproc) >= NUM_GPUS ? $(nproc) / NUM_GPUS : 1 ))  # integer division would yield 0 with fewer cores than GPUs
 fi
 torchrun --standalone \
          --nnodes=1 \
-         --nproc-per-node 4 \
+         --nproc-per-node "$NUM_GPUS" \
          deepspeed_autotp.py --repo-id-or-model-path "meta-llama/Llama-2-7b-hf"

From b550afcb501d49772cb12511b781a9be938e34a8 Mon Sep 17 00:00:00 2001
From: Yang Wang <yang3.wang@intel.com>
Date: Thu, 26 Oct 2023 21:44:42 +0000
Subject: [PATCH 2/2] change word

---
 python/llm/example/GPU/Deepspeed-AutoTP/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index ea47305bee5..60000b4bbed 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -1,6 +1,6 @@
-# Optimizing Deepspeed AutoTP with BigDL-LLM on Multiple Intel GPUs
+# Run BigDL-LLM on Multiple Intel GPUs using DeepSpeed AutoTP
 
-This example demonstrates how to optimize a Deepspeed AutoTP model using BigDL-LLM low-bit optimizations on multiple [Intel GPUs](../README.md).
+This example demonstrates how to run a BigDL-LLM optimized low-bit model on multiple [Intel GPUs](../README.md) by leveraging DeepSpeed AutoTP.
 
 ## 0. Requirements
 To run this example with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine.