diff --git a/.github/actions/llm/setup-llm-env-4.36/action.yml b/.github/actions/llm/setup-llm-env-4.36/action.yml new file mode 100644 index 00000000000..4d0b7550f74 --- /dev/null +++ b/.github/actions/llm/setup-llm-env-4.36/action.yml @@ -0,0 +1,45 @@ +name: "Setup IPEX-LLM Env" +description: "IPEX-LLM installation" +inputs: + extra-dependency: + description: "Name of extra dependencies filled in brackets" + required: false + default: "all" +runs: + using: "composite" + steps: + - name: Create conda env for llm tests and conduct install tests + shell: bash + run: | + # make sure we install the latest version for bigdl-core-xe related packages + pip uninstall bigdl-core-xe -y || true + pip uninstall bigdl-core-xe-esimd -y || true + pip uninstall bigdl-core-xe-21 -y || true + pip uninstall bigdl-core-xe-esimd-21 -y || true + sed -i 's/"bigdl-core-xe==" + CORE_XE_VERSION + "/"bigdl-core-xe/g' python/llm/setup.py + sed -i 's/"bigdl-core-xe-esimd==" + CORE_XE_VERSION + "/"bigdl-core-xe-esimd/g' python/llm/setup.py + sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py + sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py + + pip install requests + if [[ ${{ runner.os }} == 'Linux' ]]; then + bash python/llm/dev/release_default_linux.sh default false + elif [[ ${{ runner.os }} == 'Windows' ]]; then + bash python/llm/dev/release_default_windows.sh default false + else + echo "Runner os is not supported!!!!!" + exit 1 + fi + whl_name=$(ls python/llm/dist) + if [[ ${{ inputs.extra-dependency }} == 'xpu_2.0' ]]; then + pip install --upgrade --pre -i https://pypi.python.org/simple --force-reinstall "python/llm/dist/${whl_name}[xpu_2.0]" --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ + pip install pytest expecttest + elif [[ ${{ inputs.extra-dependency }} == 'xpu_2.1' ]]; then + pip install --upgrade --pre -i https://pypi.python.org/simple --force-reinstall "python/llm/dist/${whl_name}[xpu_2.1]" --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ + pip install pytest expecttest + else + pip install --upgrade --pre -i https://pypi.python.org/simple --force-reinstall "python/llm/dist/${whl_name}[all]" + pip install pytest + bash python/llm/test/run-llm-install-tests.sh + fi + pip install transformers==4.36.2 diff --git a/.github/workflows/llm_unit_tests_4.36.yml b/.github/workflows/llm_unit_tests_4.36.yml new file mode 100644 index 00000000000..3e8d8140c45 --- /dev/null +++ b/.github/workflows/llm_unit_tests_4.36.yml @@ -0,0 +1,405 @@ +name: LLM Unit Tests 4.36 + +# Cancel previous runs in the PR when you push new commits +concurrency: + group: ${{ github.workflow }}-llm-unittest-${{ github.event.pull_request.number || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +# Controls when the action will run. 
+on:
+  # Triggers the workflow on push or pull request events but only for the main branch
+  push:
+    branches: [main]
+    paths:
+      - "python/llm/**"
+      - ".github/workflows/llm_unit_tests.yml"
+      - ".github/workflows/llm-binary-build.yml"
+      - ".github/actions/llm/setup-llm-env/action.yml"
+      - ".github/actions/llm/remove-llm-env/action.yml"
+      - ".github/actions/llm/cli-test-linux/action.yml"
+      - ".github/actions/llm/cli-test-windows/action.yml"
+      - ".github/actions/llm/download-llm-binary/action.yml"
+  pull_request:
+    branches: [main]
+    paths:
+      - "python/llm/**"
+      - ".github/workflows/llm_unit_tests.yml"
+      - ".github/workflows/llm-binary-build.yml"
+      - ".github/actions/llm/setup-llm-env/action.yml"
+      - ".github/actions/llm/remove-llm-env/action.yml"
+      - ".github/actions/llm/cli-test-linux/action.yml"
+      - ".github/actions/llm/cli-test-windows/action.yml"
+      - ".github/actions/llm/download-llm-binary/action.yml"
+  workflow_dispatch:
+  workflow_call:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  llm-cpp-build:
+    uses: ./.github/workflows/llm-binary-build.yml
+  setup-python-version:
+    runs-on: ubuntu-latest
+    outputs:
+      python-version: ${{ steps.setup-python-version.outputs.python-version }}
+    steps:
+      - name: setup-python-version
+        id: setup-python-version
+        run: |
+          if [ ${{ github.event_name }} == 'schedule' ]; then
+            python_version='["3.9", "3.10", "3.11"]'
+          else
+            python_version='["3.11"]'
+          fi
+          list=$(echo ${python_version} | jq -c)
+          echo "python-version=${list}" >> "$GITHUB_OUTPUT"
+  llm-unit-test:
+    needs: [setup-python-version, llm-cpp-build]
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [windows, ubuntu-20.04-lts]
+        python-version: ${{ fromJson(needs.setup-python-version.outputs.python-version) }}
+        include:
+          - os: windows
+            instruction: AVX-VNNI-UT
+          - os: ubuntu-20.04-lts
+            instruction: avx512
+    runs-on: [self-hosted, llm, "${{matrix.instruction}}", "${{matrix.os}}"]
+    env:
+      THREAD_NUM: 24
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+    steps:
+      - name: Set model directories
+        shell: bash
+        run: |
+          echo "DATASET_DIR=${{ github.workspace }}/../llm/datasets" >> "$GITHUB_ENV"
+          echo "ORIGIN_DIR=${{ github.workspace }}/../llm/origin-models" >> "$GITHUB_ENV"
+          echo "ORIGIN_DIR_436=${{ github.workspace }}/../llm/origin-models-4.36" >> "$GITHUB_ENV"
+          echo "INT4_CKPT_DIR=${{ github.workspace }}/../llm/converted-models" >> "$GITHUB_ENV"
+      - name: Create model directories
+        shell: bash
+        run: |
+          if [ ! -d $DATASET_DIR ]; then
+            mkdir -p $DATASET_DIR
+          fi
+          if [ ! -d $ORIGIN_DIR ]; then
+            mkdir -p $ORIGIN_DIR
+          fi
+          if [ ! -d $ORIGIN_DIR_436 ]; then
+            mkdir -p $ORIGIN_DIR_436
+          fi
+          if [ ! -d $INT4_CKPT_DIR ]; then
+            mkdir -p $INT4_CKPT_DIR
+          fi
+      - name: Set environment variables
+        shell: bash
+        run: |
+          echo "SPEECH_DATASET_PATH=${DATASET_DIR}/librispeech_asr_dummy" >> "$GITHUB_ENV"
+          echo "COMMON_VOICE_PATH=${DATASET_DIR}/common_voice" >> "$GITHUB_ENV"
+
+          echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
+          echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
+          echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR_436}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "ORIGINAL_REPLIT_CODE_PATH=${ORIGIN_DIR}/replit-code-v1-3b" >> "$GITHUB_ENV"
+          echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
+          echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
+          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
+          echo "VICUNA_7B_1_3_ORIGIN_PATH=${ORIGIN_DIR}/vicuna-7b-v1.3" >> "$GITHUB_ENV"
+
+          echo "LLAMA_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_llama_7b_q4_0.bin" >> "$GITHUB_ENV"
+          echo "GPTNEOX_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_redpajama_7b_q4_0.bin" >> "$GITHUB_ENV"
+          echo "BLOOM_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_bloom_7b_q4_0.bin" >> "$GITHUB_ENV"
+          echo "STARCODER_INT4_CKPT_PATH=${INT4_CKPT_DIR}/bigdl_llm_santacoder_1b_q4_0.bin" >> "$GITHUB_ENV"
+          echo "CHATGLM_INT4_CKPT_PATH=${INT4_CKPT_DIR}/chatglm2-6b-q4_0.bin" >> "$GITHUB_ENV"
+      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade setuptools==58.0.4
+          python -m pip install --upgrade wheel
+
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env-4.36
+
+      - name: Download ckpt & original models
+        shell: bash
+        run: |
+          if [ ! -e $LLAMA_INT4_CKPT_PATH ]; then
+            echo "File $LLAMA_INT4_CKPT_PATH not found. Downloading from FTP server..."
+            echo "wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/bigdl_llm_llama_7b_q4_0.bin -P $INT4_CKPT_DIR"
+            wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/bigdl_llm_llama_7b_q4_0.bin -P $INT4_CKPT_DIR
+          fi
+          if [ ! -e $GPTNEOX_INT4_CKPT_PATH ]; then
+            echo "File $GPTNEOX_INT4_CKPT_PATH not found. Downloading from FTP server..."
+            wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/bigdl_llm_redpajama_7b_q4_0.bin -P $INT4_CKPT_DIR
+          fi
+          if [ ! -e $BLOOM_INT4_CKPT_PATH ]; then
+            echo "File $BLOOM_INT4_CKPT_PATH not found. Downloading from FTP server..."
+            wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/bigdl_llm_bloom_7b_q4_0.bin -P $INT4_CKPT_DIR
+          fi
+          if [ ! -e $STARCODER_INT4_CKPT_PATH ]; then
+            echo "File $STARCODER_INT4_CKPT_PATH not found. Downloading from FTP server..."
+            wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/bigdl_llm_santacoder_1b_q4_0.bin -P $INT4_CKPT_DIR
+          fi
+          # if [ ! -e $CHATGLM_INT4_CKPT_PATH ]; then
+          #   echo "File $CHATGLM_INT4_CKPT_PATH not found. Downloading from FTP server..."
+          #   wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/chatglm2-6b-q4_0.bin -P $INT4_CKPT_DIR
+          # fi
+          if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
+            echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
+ echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436" + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR + fi + if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then + echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..." + echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR" + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR + fi + if [ ! -d $ORIGINAL_WHISPER_TINY_PATH ]; then + echo "Directory $ORIGINAL_WHISPER_TINY_PATH not found. Downloading from FTP server..." + echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/whisper-tiny -P $ORIGIN_DIR" + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/whisper-tiny -P $ORIGIN_DIR + fi + if [ ! -d $MISTRAL_ORIGIN_PATH ]; then + echo "Directory $MISTRAL_ORIGIN_PATH not found. Downloading from FTP server..." + echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Mistral-7B-v0.1 -P $ORIGIN_DIR" + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Mistral-7B-v0.1 -P $ORIGIN_DIR + fi + if [ ! -d $LLAMA_ORIGIN_PATH ]; then + echo "Directory $LLAMA_ORIGIN_PATH not found. Downloading from FTP server..." + echo "wget --no-verbose $LLM_FTP_URL/llm/llama-7b-hf -P $ORIGIN_DIR" + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/llama-7b-hf -P $ORIGIN_DIR + fi + if [ ! -d $BLOOM_ORIGIN_PATH ]; then + echo "Directory $BLOOM_ORIGIN_PATH not found. Downloading from FTP server..." + echo "wget --no-verbose $LLM_FTP_URL/llm/bloom-7b1 -P $ORIGIN_DIR" + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/bloom-7b1 -P $ORIGIN_DIR + fi + if [ ! -d $SPEECH_DATASET_PATH ]; then + echo "Directory $SPEECH_DATASET_PATH not found. Downloading from FTP server..." + echo "wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/librispeech_asr_dummy -P $DATASET_DIR" + wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/librispeech_asr_dummy -P $DATASET_DIR + fi + if [ ! -d $COMMON_VOICE_PATH ]; then + echo "Directory $COMMON_VOICE_PATH not found. Downloading from FTP server..." + echo "wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/common_voice -P $DATASET_DIR" + wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/common_voice -P $DATASET_DIR + fi + if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR + fi + if [ ! -d $VICUNA_7B_1_3_ORIGIN_PATH ]; then + echo "Directory $VICUNA_7B_1_3_ORIGIN_PATH not found. Downloading from FTP server..." 
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/vicuna-7b-v1.3 -P $ORIGIN_DIR
+          fi
+
+      - name: Run LLM cli test (Linux)
+        if: runner.os == 'Linux'
+        uses: ./.github/actions/llm/cli-test-linux
+      - name: Run LLM cli test (Windows)
+        if: runner.os == 'Windows'
+        uses: ./.github/actions/llm/cli-test-windows
+      - name: Run LLM inference test
+        shell: bash
+        run: |
+          python -m pip install einops datasets librosa openai-whisper
+          bash python/llm/test/run-llm-inference-tests-436.sh
+      - name: Run LLM langchain test
+        shell: bash
+        run: |
+          pip install -U langchain==0.0.184
+          pip install -U chromadb==0.3.25
+          pip install -U pandas==2.0.3
+          bash python/llm/test/run-llm-langchain-tests.sh
+      - name: Run LLM llamaindex test
+        shell: bash
+        run: |
+          pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
+          pip install transformers==4.36.2
+          pip install "pydantic>=2.0.0"
+          bash python/llm/test/run-llm-llamaindex-tests.sh
+  llm-unit-test-on-arc:
+    needs: [setup-python-version, llm-cpp-build]
+    strategy:
+      fail-fast: false
+      matrix:
+        pytorch-version: ['2.1', '2.0']
+        python-version: ${{ fromJson(needs.setup-python-version.outputs.python-version) }}
+    runs-on: [self-hosted, llm, arc-ut]
+    env:
+      # OMP_NUM_THREADS: 16
+      # THREAD_NUM: 16
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+    steps:
+      - name: Set environment variables
+        shell: bash
+        run: |
+          echo "DATASET_DIR=${ORIGIN_DIR}/../datasets" >> "$GITHUB_ENV"
+          echo "YAHMA_ALPACA_CLEANED_PATH=${ORIGIN_DIR}/../datasets/yahma_alpaca_cleaned" >> "$GITHUB_ENV"
+          echo "SPEECH_DATASET_PATH=${ORIGIN_DIR}/../datasets/librispeech_asr_dummy" >> "$GITHUB_ENV"
+
+          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
+          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR_436}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "FALCON_7B_ORIGIN_PATH=${ORIGIN_DIR}/falcon-7b-instruct-with-patch" >> "$GITHUB_ENV"
+          echo "MPT_7B_ORIGIN_PATH=${ORIGIN_DIR_436}/mpt-7b-chat" >> "$GITHUB_ENV"
+          echo "WHISPER_TINY_ORIGIN_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
+          echo "MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-Instruct-v0.1" >> "$GITHUB_ENV"
+          echo "BAICHUAN2_7B_ORIGIN_PATH=${ORIGIN_DIR_436}/Baichuan2-7B-Chat" >> "$GITHUB_ENV"
+          echo "QWEN_7B_ORIGIN_PATH=${ORIGIN_DIR}/Qwen-7B-Chat" >> "$GITHUB_ENV"
+          echo "VICUNA_7B_1_3_ORIGIN_PATH=${ORIGIN_DIR}/vicuna-7b-v1.3" >> "$GITHUB_ENV"
+      - name: Checkout repo
+        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade setuptools
+          python -m pip install --upgrade wheel
+          python -m pip install --upgrade notebook
+
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Install IPEX-LLM for xpu
+        uses: ./.github/actions/llm/setup-llm-env-4.36
+        with:
+          extra-dependency: "xpu_${{ matrix.pytorch-version }}"
+
+      - name: Test installed xpu version
+        shell: bash
+        run: |
+          # Specific oneapi position on arc ut test machines
+          if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then
+            source /opt/intel/oneapi/setvars.sh
+          elif [[ '${{ matrix.pytorch-version }}' == '2.0' ]]; then
+            source /home/arda/intel/oneapi/setvars.sh
+          fi
+          bash python/llm/test/run-llm-install-tests.sh
+
+      - name: Download LLMs and datasets
+        shell: bash
+        run: |
if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR + fi + if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then + echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436 + fi + if [ ! -d $FALCON_7B_ORIGIN_PATH ]; then + echo "Directory $FALCON_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/falcon-7b-instruct-with-patch -P $ORIGIN_DIR + fi + if [ ! -d $MPT_7B_ORIGIN_PATH ]; then + echo "Directory $MPT_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/mpt-7b-chat -P $ORIGIN_DIR_436 + fi + if [ ! -d $WHISPER_TINY_ORIGIN_PATH ]; then + echo "Directory $WHISPER_TINY_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/whisper-tiny -P $ORIGIN_DIR + fi + if [ ! -d $DATASET_DIR ]; then + mkdir -p $DATASET_DIR + fi + if [ ! -d $YAHMA_ALPACA_CLEANED_PATH ]; then + echo "Directory $YAHMA_ALPACA_CLEANED_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/yahma_alpaca_cleaned -P $DATASET_DIR + fi + if [ ! -d $SPEECH_DATASET_PATH ]; then + echo "Directory $SPEECH_DATASET_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/librispeech_asr_dummy -P $DATASET_DIR + fi + if [ ! -d $MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH ]; then + echo "Directory $MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Mistral-7B-Instruct-v0.1 -P $ORIGIN_DIR + fi + if [ ! -d $QWEN_7B_ORIGIN_PATH ]; then + echo "Directory $QWEN_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Qwen-7B-Chat -P $ORIGIN_DIR + fi + if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then + echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/Baichuan2-7B-Chat -P $ORIGIN_DIR_436 + fi + if [ ! -d $VICUNA_7B_1_3_ORIGIN_PATH ]; then + echo "Directory $VICUNA_7B_1_3_ORIGIN_PATH not found. Downloading from FTP server..." 
+ wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/vicuna-7b-v1.3 -P $ORIGIN_DIR + fi + + - name: Run LLM inference test + shell: bash + run: | + # Specific oneapi position on arc ut test machines + if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then + source /opt/intel/oneapi/setvars.sh + elif [[ '${{ matrix.pytorch-version }}' == '2.0' ]]; then + source /home/arda/intel/oneapi/setvars.sh + fi + python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator + bash python/llm/test/run-llm-inference-tests-gpu-436.sh + + - name: Run LLM example tests + shell: bash + run: | + python -m pip uninstall datasets -y + python -m pip install transformers==4.34.0 datasets peft==0.5.0 accelerate==0.23.0 + python -m pip install bitsandbytes scipy + # Specific oneapi position on arc ut test machines + if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then + source /opt/intel/oneapi/setvars.sh + elif [[ '${{ matrix.pytorch-version }}' == '2.0' ]]; then + source /home/arda/intel/oneapi/setvars.sh + fi + bash python/llm/test/run-llm-example-tests-gpu.sh + + - name: Run LLM langchain GPU test + shell: bash + run: | + pip install -U langchain==0.0.184 + pip install -U chromadb==0.3.25 + pip install -U pandas==2.0.3 + # Specific oneapi position on arc ut test machines + if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then + source /opt/intel/oneapi/setvars.sh + elif [[ '${{ matrix.pytorch-version }}' == '2.0' ]]; then + source /home/arda/intel/oneapi/setvars.sh + fi + bash python/llm/test/run-llm-langchain-tests-gpu.sh + + pip install -U langchain + pip install -U langchain-community + bash python/llm/test/run-langchain-upstream-tests.sh + + - name: Run LLM llamaindex GPU test + shell: bash + run: | + pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface + # Specific oneapi position on arc ut test machines + if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then +# pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ + source /opt/intel/oneapi/setvars.sh + elif [[ '${{ matrix.pytorch-version }}' == '2.0' ]]; then +# pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ + source /home/arda/intel/oneapi/setvars.sh + fi + pip install "pydantic>=2.0.0" + bash python/llm/test/run-llm-llamaindex-tests-gpu.sh \ No newline at end of file diff --git a/python/llm/test/inference/test_transformers_api_436.py b/python/llm/test/inference/test_transformers_api_436.py new file mode 100644 index 00000000000..9d528e11631 --- /dev/null +++ b/python/llm/test/inference/test_transformers_api_436.py @@ -0,0 +1,153 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import unittest +import os +import tempfile +import time +import torch +import pytest + +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq +from transformers import AutoTokenizer, LlamaTokenizer + +class TestTransformersAPI(unittest.TestCase): + + def setUp(self): + thread_num = os.environ.get('THREAD_NUM') + if thread_num is not None: + self.n_threads = int(thread_num) + else: + self.n_threads = 2 + + def test_transformers_auto_model_int4(self): + model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH') + model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + input_str = "Tell me the capital of France.\n\n" + + with torch.inference_mode(): + st = time.time() + input_ids = tokenizer.encode(input_str, return_tensors="pt") + output = model.generate(input_ids, do_sample=False, max_new_tokens=32) + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + end = time.time() + print('Prompt:', input_str) + print('Output:', output_str) + print(f'Inference time: {end-st} s') + res = 'Paris' in output_str + self.assertTrue(res) + + + def test_transformers_auto_model_for_speech_seq2seq_int4(self): + from transformers import WhisperProcessor, WhisperForConditionalGeneration + from datasets import load_from_disk + model_path = os.environ.get('ORIGINAL_WHISPER_TINY_PATH') + dataset_path = os.environ.get('SPEECH_DATASET_PATH') + processor = WhisperProcessor.from_pretrained(model_path) + ds = load_from_disk(dataset_path) + sample = ds[0]["audio"] + input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features + model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True) + with torch.inference_mode(): + st = time.time() + predicted_ids = model.generate(input_features) + # decode token ids to text + transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False) + end = time.time() + print('Output:', transcription) + print(f'Inference time: {end-st} s') + res = 'Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.' 
in transcription[0] + self.assertTrue(res) + + def test_transformers_chatglm_for_causallm(self): + from ipex_llm.transformers import ChatGLMForCausalLM + model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH') + model = ChatGLMForCausalLM.from_pretrained(model_path, native=False, trust_remote_code=True, load_in_4bit=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + input_str = "Tell me the capital of France.\n\n" + + with torch.inference_mode(): + st = time.time() + input_ids = tokenizer.encode(input_str, return_tensors="pt") + output = model.generate(input_ids, do_sample=False, max_new_tokens=32) + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + end = time.time() + print('Prompt:', input_str) + print('Output:', output_str) + print(f'Inference time: {end-st} s') + res = 'Paris' in output_str + self.assertTrue(res) + +@pytest.mark.parametrize('prompt, answer', [ + ('What is the capital of France?\n\n', 'Paris') + ]) +@pytest.mark.parametrize('Model, Tokenizer, model_path',[ + (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')), + (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')), + ]) +def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer): + tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True) + model = Model.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=True, + trust_remote_code=True) + + with tempfile.TemporaryDirectory() as tempdir: + model.save_low_bit(tempdir) + loaded_model = Model.load_low_bit(tempdir, + optimize_model=True, + trust_remote_code=True) + + with torch.inference_mode(): + input_ids = tokenizer.encode(prompt, return_tensors="pt") + output = loaded_model.generate(input_ids, max_new_tokens=32) + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + + assert answer in output_str + +prompt = "Once upon a time, there existed a little girl who liked to have adventures. 
She wanted to go to places and meet new people, and have fun" + +@pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [ + (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt), + (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt), + (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt), + (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt), +]) + +def test_optimize_model(Model, Tokenizer, model_path, prompt): + tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True) + input_ids = tokenizer.encode(prompt, return_tensors="pt") + + model = Model.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=False, + trust_remote_code=True) + logits_base_model = (model(input_ids)).logits + + model = Model.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=True, + trust_remote_code=True) + logits_optimized_model = (model(input_ids)).logits + diff = abs(logits_base_model - logits_optimized_model).flatten() + + assert any(diff) is False + + +if __name__ == '__main__': + pytest.main([__file__]) diff --git a/python/llm/test/run-llm-inference-tests-436.sh b/python/llm/test/run-llm-inference-tests-436.sh new file mode 100644 index 00000000000..60d6950ad6a --- /dev/null +++ b/python/llm/test/run-llm-inference-tests-436.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} +export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src +export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference + +set -e + +echo "# Start testing inference" +start=$(date "+%s") + +python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_call_models.py -v + +if [ -z "$THREAD_NUM" ]; then + THREAD_NUM=2 +fi +export OMP_NUM_THREADS=$THREAD_NUM +python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_436.py -v +python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v + +now=$(date "+%s") +time=$((now-start)) + +echo "Bigdl-llm tests finished" +echo "Time used:$time seconds" diff --git a/python/llm/test/run-llm-inference-tests-gpu-436.sh b/python/llm/test/run-llm-inference-tests-gpu-436.sh new file mode 100644 index 00000000000..5e48c0df876 --- /dev/null +++ b/python/llm/test/run-llm-inference-tests-gpu-436.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} +export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src +export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu + +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export DEVICE='xpu' + +set -e + +echo "# Start testing inference" +start=$(date "+%s") + +# if [ -z "$THREAD_NUM" ]; then +# THREAD_NUM=2 +# fi +# export OMP_NUM_THREADS=$THREAD_NUM +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v -s +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_layernorm.py -v -s +export BIGDL_LLM_XMX_DISABLED=1 +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_final_logits.py -v -s +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s +unset BIGDL_LLM_XMX_DISABLED + +now=$(date "+%s") +time=$((now-start)) + +echo "Bigdl-llm gpu inference tests finished" +echo "Time used:$time seconds" + +echo "# Start testing layers.fast_rope_embedding" +start=$(date "+%s") + +pytest 
${LLM_INFERENCE_TEST_DIR}/test_layer_fast_rope.py -v -s + +now=$(date "+%s") +time=$((now-start)) + +echo "Bigdl-llm gpu layers.fast_rope_embedding tests finished" +echo "Time used:$time seconds"
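
Note (not part of the patch): a minimal sketch of how the new CPU-side 4.36 tests could be run by hand, assuming a local ipex-llm checkout and locally downloaded copies of the models referenced above. Every path below is an illustrative placeholder, and the pre-existing tests invoked by the same script (test_call_models.py, test_optimize_model_api.py) may require additional model paths to be exported.

    #!/bin/bash
    # Hypothetical local invocation of the transformers-4.36 CPU tests added in this patch.
    # Adjust every path to your own environment.
    export ANALYTICS_ZOO_ROOT=$HOME/ipex-llm                      # assumed repo checkout root
    export THREAD_NUM=8                                           # read by the tests and mapped to OMP_NUM_THREADS
    export ORIGINAL_CHATGLM2_6B_PATH=$HOME/models/chatglm2-6b
    export ORIGINAL_WHISPER_TINY_PATH=$HOME/models/whisper-tiny
    export SPEECH_DATASET_PATH=$HOME/datasets/librispeech_asr_dummy
    export LLAMA_ORIGIN_PATH=$HOME/models/llama-7b-hf
    export BLOOM_ORIGIN_PATH=$HOME/models/bloom-7b1
    export MISTRAL_ORIGIN_PATH=$HOME/models/Mistral-7B-v0.1
    pip install transformers==4.36.2                              # the version this workflow pins
    cd "$ANALYTICS_ZOO_ROOT"
    bash python/llm/test/run-llm-inference-tests-436.sh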