diff --git a/.github/actions/ppml/ppml-occlum-EDMM-exampletests-action/action.yml b/.github/actions/ppml/ppml-occlum-EDMM-exampletests-action/action.yml new file mode 100644 index 00000000000..28e251fb22d --- /dev/null +++ b/.github/actions/ppml/ppml-occlum-EDMM-exampletests-action/action.yml @@ -0,0 +1,192 @@ +name: 'Run PPML Occlum EDMM ExampleTests' +description: 'Run PPML Occlum EDMM ExampleTests' +inputs: + image: + description: 'image' + required: true + default: '10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum' + image-tag: + description: 'image tag' + required: true + default: '2.4.0-SNAPSHOT-EDMM6' +runs: + using: "composite" + steps: + - name: Run tests + shell: bash + env: + DEFAULT_IMAGE: ${{ inputs.image }}:${{ inputs.image-tag }} + run: | + whoami + + # icx-6's kernel support EDMM + export LOCAL_IP=172.168.0.210 + export CPUSET="6-10" + export CONTAINER_NAME="spark-occlum-edmm-jenkins" + + export DATA_PATH=/home/icx/glorysdj/data + export KEYS_PATH=/home/icx/glorysdj/keys + export SECURE_PASSWORD_PATH=/home/icx/glorysdj/password + export SGX_MEM_SIZE=30GB + export SGX_KERNEL_HEAP=2GB + export IMAGE=${{ env.DEFAULT_IMAGE }} + + docker pull $IMAGE + docker pull intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum:2.4.0-SNAPSHOT-EDMM + + docker stop $CONTAINER_NAME + docker rm -f $CONTAINER_NAME + + docker run -itd \ + --net=host \ + --cpuset-cpus=$CPUSET \ + --oom-kill-disable \ + --device=/dev/sgx/enclave \ + --device=/dev/sgx/provision \ + -v /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket \ + -v $DATA_PATH:/opt/occlum_spark/data \ + -v $KEYS_PATH:/opt/keys \ + --name=$CONTAINER_NAME \ + -e LOCAL_IP=$LOCAL_IP \ + -e SGX_MEM_SIZE=$SGX_MEM_SIZE \ + -e SGX_KERNEL_HEAP=$SGX_KERNEL_HEAP \ + $IMAGE \ + bash -c "tail -f /dev/null" + + status_1_spark_pi=1 + status_2_bigdl_lenet_mnist=1 + status_3_bigdl_resnet_cifar10=1 + status_4_spark_tpch=1 + status_5_spark_ut=0 + status_6_spark_xgboost=1 + status_7_spark_gbt=1 + status_8_pyspark_sklearn=1 + status_9_pyspark_sql=1 + status_10_pyspark_tpch=1 + status_11_spark_lgbm=1 + + if [ $status_1_spark_pi -ne 0 ]; then + echo "################## start spark pi" + echo "example.1 spark pi" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh pi | tee test-spark-pi-sgx.log && \ + cat test-spark-pi-sgx.log | egrep 'Pi is roughly 3'" + status_1_spark_pi=$(echo $?) + fi + + if [ $status_2_bigdl_lenet_mnist -ne 0 ]; then + echo "################## start bigdl lenet mnist" + echo "example.2 bigdl lenet mnist" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's# run_spark_lenet_mnist# run_spark_lenet_mnist -b 4 -e 1#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's# -f /host/data# -f /host/data/lenet#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh lenet -b 8 -e 1 | tee bigdl-lenet-mnist.log && \ + cat bigdl-lenet-mnist.log | egrep 'records/second. Loss is' && \ + sed -i 's# -f /host/data/lenet# -f /host/data#g' run_spark_on_occlum_glibc.sh" + status_2_bigdl_lenet_mnist=$(echo $?) 
+ fi + + if [ $status_3_bigdl_resnet_cifar10 -ne 0 ]; then + echo "################## start bigdl resnet cifar10" + echo "example.3 bigdl resnet cifar10" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's# run_spark_resnet_cifar# run_spark_resnet_cifar --nEpochs 1#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's# -f /host/data# -f /host/data/cifar#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh resnet | tee bigdl-resnet-cifar10.log && \ + cat bigdl-resnet-cifar10.log | egrep 'Current momentum is '&& \ + sed -i 's# -f /host/data/cifar# -f /host/data#g' run_spark_on_occlum_glibc.sh" + status_3_bigdl_resnet_cifar10=$(echo $?) + fi + + if [ $status_4_spark_tpch -ne 0 ]; then + echo "################## start spark tpch" + echo "example.4 spark tpch" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#spark.driver.memory=12g#spark.driver.memory=2g#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#spark.executor.instances=8#spark.executor.instances=2#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#executor-memory 8G#executor-memory 2G#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#-Xmx78g -Xms78g#-Xmx10g -Xms10g#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#/host/data /host/data/output#/host/data/tpch /host/data/output#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh tpch | tee spark-tpch.log && \ + cat spark-tpch.log | egrep '22 finished-'" + status_4_spark_tpch=$(echo $?) + fi + + if [ $status_5_spark_ut -ne 0 ]; then + echo "################## start spark unit test" + echo "example.5 spark unit test" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#192.168.0.111#$LOCAL_IP#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh ut | tee spark-unit-test.log && \ + cat spark-unit-test.log | egrep 'FINISHED o.a.s.status.api.v1.sql.SqlResourceSuite:'" + status_5_spark_ut=$(echo $?) + fi + + if [ $status_6_spark_xgboost -ne 0 ]; then + echo "################## start spark xgboost" + echo "example.6 spark xgboost" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#-i /host/data -s /host/data/model -t 2 -r 100 -d 2 -w 1#-i /host/data/xgboost -s /host/data/xgboost/model -t 2 -r 10 -d 2 -w 1#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh xgboost | tee spark-xgboost.log && \ + cat spark-xgboost.log | egrep 'end time is'" + status_6_spark_xgboost=$(echo $?) + fi + + if [ $status_7_spark_gbt -ne 0 ]; then + echo "################## start spark gbt" + echo "example.7 spark gbt" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#-i /host/data -s /host/data/model -I 100 -d 5#-i /host/data/gbt -s /host/data/gbt/model -I 10 -d 5#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh gbt | tee spark-gbt.log && \ + cat spark-gbt.log | egrep 'end time is'" + status_7_spark_gbt=$(echo $?) + fi + + if [ $status_8_pyspark_sklearn -ne 0 ]; then + echo "################## start pyspark sklearn Linear Regression" + echo "example.8 pyspark sklearn" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh pysklearn | tee test-pyspark-sklearn-sgx.log && \ + cat test-pyspark-sklearn-sgx.log | egrep 'mean_squared_error'" + status_8_pyspark_sklearn=$(echo $?) 
+ fi + + if [ $status_9_pyspark_sql -ne 0 ]; then + echo "################## start pyspark SQL example" + echo "example.9 pyspark sql" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh pysql | tee test-pyspark-sql-sgx.log && \ + cat test-pyspark-sql-sgx.log | egrep 'Example API finished'" + status_9_pyspark_sql=$(echo $?) + fi + + if [ $status_10_pyspark_tpch -ne 0 ]; then + echo "################## start pyspark tpch" + echo "example.10 pyspark tpch" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#/host/data/ /host/data/output/ true#/host/data/tpch/ /host/data/output/ false#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh pytpch | tee pyspark-tpch.log && \ + cat pyspark-tpch.log | egrep 'total time is'" + status_10_pyspark_tpch=$(echo $?) + fi + + if [ $status_11_spark_lgbm -ne 0 ]; then + echo "################## start spark lgbm" + echo "example.11 spark lgbm" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh lgbm | tee spark-lgbm.log && \ + cat spark-lgbm.log | egrep 'acc:'" + status_11_spark_lgbm=$(echo $?) + fi + + echo "status_1_spark_pi $status_1_spark_pi" + echo "status_2_bigdl_lenet_mnist $status_2_bigdl_lenet_mnist" + echo "status_3_bigdl_resnet_cifar10 $status_3_bigdl_resnet_cifar10" + echo "status_4_spark_tpch $status_4_spark_tpch" + #echo "status_5_spark_ut $status_5_spark_ut" + echo "status_6_spark_xgboost $status_6_spark_xgboost" + echo "status_7_spark_gbt $status_7_spark_gbt" + echo "status_8_pyspark_sklearn $status_8_pyspark_sklearn" + echo "status_9_pyspark_sql $status_9_pyspark_sql" + echo "status_10_pyspark_tpch $status_10_pyspark_tpch" + echo "status_11_spark_lgbm $status_11_spark_lgbm" diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml index 93ab24838b2..815f493239a 100644 --- a/.github/workflows/llm_performance_tests.yml +++ b/.github/workflows/llm_performance_tests.yml @@ -36,6 +36,10 @@ jobs: env: THREAD_NUM: 24 steps: + - name: Set environment variables + shell: bash + run: | + echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV" - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 @@ -55,6 +59,14 @@ jobs: env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} + - name: Download LLMs + shell: bash + run: | + if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR + fi + - name: Run LLM Performance test env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} @@ -76,10 +88,6 @@ jobs: THREAD_NUM: 16 ANALYTICS_ZOO_ROOT: ${{ github.workspace }} steps: - - name: Set model directories - shell: bash - run: | - echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV" - name: Set environment variables shell: bash run: | @@ -87,6 +95,7 @@ jobs: echo "LLAMA2_13B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-13b-chat-hf" >> "$GITHUB_ENV" echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV" echo "WHISPER_MEDIUM_ORIGIN_PATH=${ORIGIN_DIR}/whisper-medium" >> "$GITHUB_ENV" + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -112,6 +121,27 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh bash python/llm/test/run-llm-install-tests.sh + + - name: Download LLMs + shell: bash + run: | + if [ ! 
-d $LLAMA2_7B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR + fi + if [ ! -d $LLAMA2_13B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_13B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-13b-chat-hf -P $ORIGIN_DIR + fi + if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then + echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR + fi + if [ ! -d $WHISPER_MEDIUM_ORIGIN_PATH ]; then + echo "Directory $WHISPER_MEDIUM_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/whisper-medium -P $ORIGIN_DIR + fi + - name: Test on xpu shell: bash run: | diff --git a/.github/workflows/nightly_test.yml b/.github/workflows/nightly_test.yml index 0daf89a3272..d316a9ce287 100644 --- a/.github/workflows/nightly_test.yml +++ b/.github/workflows/nightly_test.yml @@ -3,7 +3,7 @@ name: Nightly Test on: #pull_request: - #branches: [ main ] + # branches: [ main ] schedule: - cron: '30 15 * * *' # GMT time, 15:30 GMT == 23:30 China @@ -68,6 +68,7 @@ on: - PPML-Scala-UT - PPML-Python-UT-Spark3 - PPML-Occlum-ExampleTests + - PPML-Occlum-EDMM-ExampleTests - PPML-spark-Local-SimpleQuery-Tests-on-Gramine - PPML-RealTime-ML-Occlum - PPML-RealTime-ML-Occlum-K8s @@ -1402,6 +1403,40 @@ jobs: job-name: PPML-Occlum-ExampleTests runner-hosted-on: 'Shanghai' + PPML-Occlum-EDMM-ExampleTests: + if: ${{ github.event.inputs.artifact == 'PPML-Occlum-EDMM-ExampleTests' }} + runs-on: [self-hosted, EDMM] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: set env + env: + DEFAULT_IMAGE: '10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum' + DEFAULT_TAG: '2.4.0-SNAPSHOT-EDMM' + run: | + echo "TAG=${{ github.event.inputs.tag || env.DEFAULT_TAG }}" >> $GITHUB_ENV + echo "IMAGE=${{ github.event.inputs.image || env.DEFAULT_IMAGE }}" >> $GITHUB_ENV + - name: Run Test + uses: ./.github/actions/ppml/ppml-occlum-EDMM-exampletests-action + with: + image: ${{env.IMAGE}} + image-tag: ${{env.TAG}} + - name: Create Job Badge + uses: ./.github/actions/create-job-status-badge + if: ${{ always() }} + with: + secret: ${{ secrets.GIST_SECRET}} + gist-id: ${{env.GIST_ID}} + is-self-hosted-runner: true + file-name: PPML-Occlum-EDMM-ExampleTests.json + type: job + job-name: PPML-Occlum-EDMM-ExampleTests + runner-hosted-on: 'Shanghai' + PPML-RealTime-ML-Occlum: if: ${{ github.event.schedule || github.event.inputs.artifact == 'PPML-RealTime-ML-Occlum' || github.event.inputs.artifact == 'all' }} runs-on: [self-hosted, Vilvarin] diff --git a/README.md b/README.md index 841d9bcd445..3b6373373d7 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@ > *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), 
[redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*

 ### Latest update
-- **[New]** `bigdl-llm` now supports QLoRA fintuning on Intel GPU; see the the example [here](python/llm/example/gpu/qlora_finetuning).
-- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the the latest GPU examples [here](python/llm/example/gpu).
+- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/GPU/QLoRA-FineTuning).
+- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples [here](python/llm/example/GPU).
 - `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial).
 - Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
@@ -76,7 +76,7 @@ input_ids = tokenizer.encode(input_str, ...)
 output_ids = model.generate(input_ids, ...)
 output = tokenizer.batch_decode(output_ids)
 ```
-*See the complete examples [here](python/llm/example/transformers/transformers_int4/).*
+*See the complete examples [here](python/llm/example/CPU/HF-Transformers-AutoModels/Model).*

 #### GPU INT4
 ##### Install
@@ -105,7 +105,7 @@ input_ids = tokenizer.encode(input_str, ...).to('xpu')
 output_ids = model.generate(input_ids, ...)
 output = tokenizer.batch_decode(output_ids.cpu())
 ```
-*See the complete examples [here](python/llm/example/gpu/).*
+*See the complete examples [here](python/llm/example/GPU).*

 #### More Low-Bit Support
 ##### Save and load
@@ -115,7 +115,7 @@ After the model is optimized using `bigdl-llm`, you may save and load the model
 model.save_low_bit(model_path)
 new_model = AutoModelForCausalLM.load_low_bit(model_path)
 ```
-*See the complete example [here](python/llm/example/transformers/transformers_low_bit/).*
+*See the complete example [here](python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load).*

 ##### Additonal data types
@@ -123,7 +123,7 @@ In addition to INT4, You may apply other low bit optimizations (such as *INT8*,
 ```python
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int8")
 ```
-*See the complete example [here](python/llm/example/transformers/transformers_low_bit/).*
+*See the complete example [here](python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types).*

 ***For more details, please refer to the `bigdl-llm` [Document](https://test-bigdl-llm.readthedocs.io/en/main/doc/LLM/index.html), [Readme](python/llm), [Tutorial](https://github.com/intel-analytics/bigdl-llm-tutorial) and [API Doc](https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/LLM/index.html).***
diff --git a/docker/llm/finetune/lora/README.md b/docker/llm/finetune/lora/README.md
deleted file mode 100644
index 98b694cfb21..00000000000
--- a/docker/llm/finetune/lora/README.md
+++ /dev/null
@@ -1,112 +0,0 @@
-## Run BF16-Optimized Lora Finetuning on Kubernetes with OneCCL
-
-[Alpaca Lora](https://github.com/tloen/alpaca-lora/tree/main) uses [low-rank adaption](https://arxiv.org/pdf/2106.09685.pdf) to speed up the finetuning process of base model [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b), and tries to reproduce the standard Alpaca, a general finetuned LLM.
This is on top of Hugging Face transformers with Pytorch backend, which natively requires a number of expensive GPU resources and takes significant time. - -By constract, BigDL here provides a CPU optimization to accelerate the lora finetuning of Llama2-7b, in the power of mixed-precision and distributed training. Detailedly, [Intel OneCCL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html), an available Hugging Face backend, is able to speed up the Pytorch computation with BF16 datatype on CPUs, as well as parallel processing on Kubernetes enabled by [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html). - -The architecture is illustrated in the following: - -![image](https://github.com/Jasonzzt/BigDL/assets/60865256/b66416bc-ad07-49af-8cb0-8967dffb5f58) - -As above, BigDL implements its MPI training build on [Kubeflow MPI operator](https://github.com/kubeflow/mpi-operator/tree/master), which encapsulates the deployment as MPIJob CRD, and assists users to handle the construction of a MPI worker cluster on Kubernetes, such as public key distribution, SSH connection, and log collection. - -Now, let's go to deploy a Lora finetuning to create a LLM from Llama2-7b. - -**Note: Please make sure you have already have an available Kubernetes infrastructure and NFS shared storage, and install [Helm CLI](https://helm.sh/docs/helm/helm_install/) for Kubernetes job submission.** - -### 1. Install Kubeflow MPI Operator - -Follow [here](https://github.com/kubeflow/mpi-operator/tree/master#installation) to install a Kubeflow MPI operator in your Kubernetes, which will listen and receive the following MPIJob request at backend. - -### 2. Download Image, Base Model and Finetuning Data - -Follow [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/lora/docker#prepare-bigdl-image-for-lora-finetuning) to prepare BigDL Lora Finetuning image in your cluster. - -As finetuning is from a base model, first download [Llama2-7b model from the public download site of Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). Then, download [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server. - -### 3. Deploy through Helm Chart - -You are allowed to edit and experiment with different parameters in `./kubernetes/values.yaml` to improve finetuning performance and accuracy. For example, you can adjust `trainerNum` and `cpuPerPod` according to node and CPU core numbers in your cluster to make full use of these resources, and different `microBatchSize` result in different training speed and loss (here note that `microBatchSize`×`trainerNum` should not more than 128, as it is the batch size). - -**Note: `dataSubPath` and `modelSubPath` need to have the same names as files under the NFS directory in step 2.** - -After preparing parameters in `./kubernetes/values.yaml`, submit the job as beflow: - -```bash -cd ./kubernetes -helm install bigdl-lora-finetuning . -``` - -### 4. Check Deployment -```bash -kubectl get all -n bigdl-lora-finetuning # you will see launcher and worker pods running -``` - -### 5. Check Finetuning Process - -After deploying successfully, you can find a launcher pod, and then go inside this pod and check the logs collected from all workers. 
- -```bash -kubectl get all -n bigdl-lora-finetuning # you will see a launcher pod -kubectl exec -it bash -n bigdl-ppml-finetuning # enter launcher pod -cat launcher.log # display logs collected from other workers -``` - -From the log, you can see whether finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with finetuning speed and estimated time will be showed after some data preprocessing steps (this may take quiet a while). - -For the fine-tuned model, it is written by the worker 0 (who holds rank 0), so you can find the model output inside the pod, which can be saved to host by command tools like `kubectl cp` or `scp`. - - -## To run in TDX-CoCo and enable Remote Attestation API - -You can deploy this workload in TDX CoCo and enable Remote Attestation API Serving with setting `TEEMode` in `./kubernetes/values.yaml` to `tdx`. The main diffences are it's need to execute the pods as root and mount TDX device, and a flask service is responsible for generating launcher's quote and collecting workers' quotes. - -### (Optional) Enable TLS -To enable TLS in Remote Attestation API Serving, you should provide a TLS certificate and setting `enableTLS` ( to `true` ), `base64ServerCrt` and `base64ServerKey` in `./kubernetes/values.yaml`. -```bash -# Generate a self-signed TLS certificate (DEBUG USE ONLY) -export COUNTRY_NAME=your_country_name -export CITY_NAME=your_city_name -export ORGANIZATION_NAME=your_organization_name -export COMMON_NAME=your_common_name -export EMAIL_ADDRESS=your_email_address - -openssl req -x509 -newkey rsa:4096 -nodes -out server.crt -keyout server.key -days 365 -subj "/C=$COUNTRY_NAME/ST=$CITY_NAME/L=$CITY_NAME/O=$ORGANIZATION_NAME/OU=$ORGANIZATION_NAME/CN=$COMMON_NAME/emailAddress=$EMAIL_ADDRESS/" - -# Calculate Base64 format string in values.yaml -cat server.crt | base64 -w 0 # Set in base64ServerCrt -cat server.key | base64 -w 0 # Set in base64ServerKey -``` - -To use RA Rest API, you need to get the IP of job-launcher: -``` bash -kubectl get all -n bigdl-lora-finetuning -``` -You will find a line like: -```bash -service/bigdl-lora-finetuning-launcher-attestation-api-service ClusterIP 10.109.87.248 9870/TCP 17m -``` -Here are IP and port of the Remote Attestation API service. - -The RA Rest API are listed below: -### 1. Generate launcher's quote -```bash -curl -X POST -H "Content-Type: application/json" -d '{"user_report_data": ""}' http://:/gen_quote -``` - -Example responce: - -```json -{"quote":"BAACAIEAAAAAAAA..."} -``` -### 2. 
Collect all cluster components' quotes (launcher and workers) -```bash -curl -X POST -H "Content-Type: application/json" -d '{"user_report_data": ""}' http://:/attest -``` - -Example responce: - -```json -{"quote_list":{"bigdl-lora-finetuning-job-worker-0":"BAACAIEAAAAAAA...","bigdl-lora-finetuning-job-worker-1":"BAACAIEAAAAAAA...","launcher":"BAACAIEAAAAAA..."}} -``` - diff --git a/docker/llm/finetune/lora/cpu/docker/README.md b/docker/llm/finetune/lora/cpu/docker/README.md index e988f8f049d..be86f2b22c9 100644 --- a/docker/llm/finetune/lora/cpu/docker/README.md +++ b/docker/llm/finetune/lora/cpu/docker/README.md @@ -3,7 +3,7 @@ You can download directly from Dockerhub like: ```bash -docker pull intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT +docker pull intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT ``` Or build the image from source: @@ -15,6 +15,6 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT \ + -t intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT \ -f ./Dockerfile . ``` diff --git a/docker/llm/finetune/lora/cpu/kubernetes/values.yaml b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml index 92a5f5e0b1b..8c3b9db2706 100644 --- a/docker/llm/finetune/lora/cpu/kubernetes/values.yaml +++ b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml @@ -1,4 +1,4 @@ -imageName: intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT +imageName: intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT trainerNum: 8 microBatchSize: 8 nfsServerIp: your_nfs_server_ip diff --git a/docker/llm/finetune/qlora/xpu/docker/README.md b/docker/llm/finetune/qlora/xpu/docker/README.md index 201dadf29ad..368fd52f2e8 100644 --- a/docker/llm/finetune/qlora/xpu/docker/README.md +++ b/docker/llm/finetune/qlora/xpu/docker/README.md @@ -28,14 +28,18 @@ docker build \ Here, we try to fine-tune a [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b) with [English Quotes](https://huggingface.co/datasets/Abirate/english_quotes) dataset, and please download them and start a docker container with files mounted like below: ```bash -export BASE_MODE_PATH= -export DATA_PATH= +export BASE_MODE_PATH=your_downloaded_base_model_path +export DATA_PATH=your_downloaded_data_path +export HTTP_PROXY=your_http_proxy +export HTTPS_PROXY=your_https_proxy docker run -itd \ --net=host \ --device=/dev/dri \ --memory="32G" \ --name=bigdl-llm-fintune-qlora-xpu \ + -e http_proxy=${HTTP_PROXY} \ + -e https_proxy=${HTTPS_PROXY} \ -v $BASE_MODE_PATH:/model \ -v $DATA_PATH:/data/english_quotes \ --shm-size="16g" \ @@ -45,11 +49,16 @@ docker run -itd \ The download and mount of base model and data to a docker container demonstrates a standard fine-tuning process. 
You can skip this step for a quick start, and in this way, the fine-tuning codes will automatically download the needed files: ```bash +export HTTP_PROXY=your_http_proxy +export HTTPS_PROXY=your_https_proxy + docker run -itd \ --net=host \ --device=/dev/dri \ --memory="32G" \ --name=bigdl-llm-fintune-qlora-xpu \ + -e http_proxy=${HTTP_PROXY} \ + -e https_proxy=${HTTPS_PROXY} \ --shm-size="16g" \ intelanalytics/bigdl-llm-fintune-qlora-xpu:2.4.0-SNAPSHOT ``` diff --git a/docker/llm/inference/xpu/docker/Dockerfile b/docker/llm/inference/xpu/docker/Dockerfile index 0b7da551b12..92dc893bc2f 100644 --- a/docker/llm/inference/xpu/docker/Dockerfile +++ b/docker/llm/inference/xpu/docker/Dockerfile @@ -8,7 +8,9 @@ ENV TZ=Asia/Shanghai # Disable pip's cache behavior ARG PIP_NO_CACHE_DIR=false -RUN apt-get update && \ +RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list && \ + apt-get update && \ apt-get install -y curl wget git gnupg gpg-agent && \ wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \ echo 'deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc' | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md index b0027f127b2..d5394d2943d 100644 --- a/docker/llm/serving/cpu/kubernetes/README.md +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -15,6 +15,8 @@ After downloading the model, please change name from `vicuna-7b-v1.5` to `vicuna You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5). +For ChatGLM models, users do not need to add `bigdl` into model path. We have already used the `BigDL-LLM` backend for this model. + ### Kubernetes config We recommend to setup your kubernetes cluster before deployment. Mostly importantly, please set `cpu-management-policy` to `static` by using this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). Also, it would be great to also set the `topology management policy` to `single-numa-node`. 
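The proxy settings added to the QLoRA fine-tuning `docker run` commands above only help if the variables are exported in the shell before the container is started. A minimal sketch for verifying that they actually reached the container, reusing the `bigdl-llm-fintune-qlora-xpu` name from the README above (adjust it if you passed a different `--name`):

```bash
# Print the proxy variables as seen inside the running fine-tuning container;
# empty values mean the -e http_proxy/https_proxy flags were not applied.
docker exec bigdl-llm-fintune-qlora-xpu \
  bash -c 'echo "http_proxy=${http_proxy:-<unset>}"; echo "https_proxy=${https_proxy:-<unset>}"'
```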
diff --git a/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt b/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt index a162d91c0c6..9339e8b3146 100644 --- a/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt +++ b/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt @@ -14,7 +14,7 @@ fastapi==0.95.2 pydantic==1.10.8 ### document qa -langchain==0.0.246 +langchain==0.0.308 pypdf chromadb==0.3.25 diff --git a/python/llm/README.md b/python/llm/README.md index bb19f43b14e..63cd54fab9e 100644 --- a/python/llm/README.md +++ b/python/llm/README.md @@ -40,23 +40,24 @@ Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLa | Model | Example | |-----------|----------------------------------------------------------| -| LLaMA *(such as Vicuna, Guanaco, Koala, Baize, WizardLM, etc.)* | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/vicuna) | -| LLaMA 2 | [link](example/transformers/transformers_int4/llama2) | -| MPT | [link](example/transformers/transformers_int4/mpt) | -| Falcon | [link](example/transformers/transformers_int4/falcon) | -| ChatGLM | [link](example/transformers/transformers_int4/chatglm) | -| ChatGLM2 | [link](example/transformers/transformers_int4/chatglm2) | -| Qwen | [link](example/transformers/transformers_int4/qwen) | -| MOSS | [link](example/transformers/transformers_int4/moss) | -| Baichuan | [link](example/transformers/transformers_int4/baichuan) | -| Baichuan2 | [link](example/transformers/transformers_int4/baichuan2) | -| Dolly-v1 | [link](example/transformers/transformers_int4/dolly_v1) | -| Dolly-v2 | [link](example/transformers/transformers_int4/dolly_v2) | -| RedPajama | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/redpajama) | -| Phoenix | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/phoenix) | -| StarCoder | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/starcoder) | -| InternLM | [link](example/transformers/transformers_int4/internlm) | -| Whisper | [link](example/transformers/transformers_int4/whisper) | +| LLaMA *(such as Vicuna, Guanaco, Koala, Baize, WizardLM, etc.)* | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/vicuna) | +| LLaMA 2 | [link](example/CPU/HF-Transformers-AutoModels/Model/llama2) | +| MPT | [link](example/CPU/HF-Transformers-AutoModels/Model/mpt) | +| Falcon | [link](example/CPU/HF-Transformers-AutoModels/Model/falcon) | +| ChatGLM | [link](example/CPU/HF-Transformers-AutoModels/Model/chatglm) | +| ChatGLM2 | [link](example/CPU/HF-Transformers-AutoModels/Model/chatglm2) | +| Qwen | [link](example/CPU/HF-Transformers-AutoModels/Model/qwen) | +| MOSS | [link](example/CPU/HF-Transformers-AutoModels/Model/moss) | +| Baichuan | [link](example/CPU/HF-Transformers-AutoModels/Model/baichuan) | +| Baichuan2 | [link](example/CPU/HF-Transformers-AutoModels/Model/baichuan2) | +| Dolly-v1 | [link](example/CPU/HF-Transformers-AutoModels/Model/dolly_v1) | +| Dolly-v2 | [link](example/CPU/HF-Transformers-AutoModels/Model/dolly_v2) | +| RedPajama | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/redpajama) | +| Phoenix | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/phoenix) | +| StarCoder | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/starcoder) | +| InternLM 
| [link](example/CPU/HF-Transformers-AutoModels/Model/internlm) | +| Whisper | [link](example/CPU/HF-Transformers-AutoModels/Model/whisper) | +| Aquila | [link](example/CPU/HF-Transformers-AutoModels/Model/aquila) | @@ -119,7 +120,7 @@ output_ids = model.generate(input_ids, ...) output = tokenizer.batch_decode(output_ids) ``` -See the complete examples [here](example/transformers/transformers_int4/). +See the complete examples [here](example/CPU/HF-Transformers-AutoModels/Model/). ###### GPU INT4 You may apply INT4 optimizations to any Hugging Face *Transformers* model on Intel GPU as follows. @@ -138,7 +139,7 @@ input_ids = tokenizer.encode(input_str, ...).to('xpu') output_ids = model.generate(input_ids, ...) output = tokenizer.batch_decode(output_ids.cpu()) ``` -See the complete examples [here](example/gpu/). +See the complete examples [here](example/GPU). ###### More Low-Bit Support - Save and load @@ -148,7 +149,7 @@ See the complete examples [here](example/gpu/). model.save_low_bit(model_path) new_model = AutoModelForCausalLM.load_low_bit(model_path) ``` - *See the complete example [here](example/transformers/transformers_low_bit/).* + *See the complete example [here](example/CPU/HF-Transformers-AutoModels/Save-Load).* - Additonal data types @@ -157,7 +158,7 @@ See the complete examples [here](example/gpu/). ```python model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int8") ``` - *See the complete example [here](example/transformers/transformers_low_bit/).* + *See the complete example [here](example/CPU/HF-Transformers-AutoModels/More-Data-Types).* ##### 2. Native INT4 model @@ -182,7 +183,7 @@ output_ids = llm.generate(input_ids, ...) output = llm.batch_decode(output_ids) ``` -See the complete example [here](example/transformers/native_int4/native_int4_pipeline.py). +See the complete example [here](example/CPU/Native-Models/native_int4_pipeline.py). ##### 3. LangChain API You may run the models using the LangChain API in `bigdl-llm`. @@ -202,7 +203,7 @@ You may run the models using the LangChain API in `bigdl-llm`. doc_chain = load_qa_chain(bigdl_llm, ...) output = doc_chain.run(...) ``` - See the examples [here](example/langchain/transformers_int4). + See the examples [here](example/CPU/LangChain/transformers_int4). - **Using native INT4 model** @@ -224,7 +225,7 @@ You may run the models using the LangChain API in `bigdl-llm`. doc_chain.run(...) ``` - See the examples [here](example/langchain/native_int4). + See the examples [here](example/CPU/LangChain/native_int4). ##### 4. CLI Tool >**Note**: Currently `bigdl-llm` CLI supports *LLaMA* (e.g., *vicuna*), *GPT-NeoX* (e.g., *redpajama*), *BLOOM* (e.g., *pheonix*) and *GPT2* (e.g., *starcoder*) model architecture; for other models, you may use the Hugging Face `transformers` or LangChain APIs. diff --git a/python/llm/dev/benchmark/run-benchmark-tests.sh b/python/llm/dev/benchmark/run-benchmark-tests.sh index 1fa8032e5b4..5ec5c489e18 100644 --- a/python/llm/dev/benchmark/run-benchmark-tests.sh +++ b/python/llm/dev/benchmark/run-benchmark-tests.sh @@ -12,11 +12,6 @@ export OMP_NUM_THREADS=$THREAD_NUM ######## LLAMA2 # transformers -if [ ! -d $ORIGINAL_LLAMA2_PATH ]; then - echo "Directory $ORIGINAL_LLAMA2_PATH not found. Downloading from FTP server..." 
- wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/${ORIGINAL_LLAMA2_PATH:2} -P $LLM_DIR -fi - echo ">>> Testing LLAMA2 transformers API" -taskset -c 0-$((THREAD_NUM - 1)) python python/llm/dev/benchmark/pipelines/llama2_test.py --repo-id-or-model-path $ORIGINAL_LLAMA2_PATH +taskset -c 0-$((THREAD_NUM - 1)) python python/llm/dev/benchmark/pipelines/llama2_test.py --repo-id-or-model-path $LLAMA2_7B_ORIGIN_PATH diff --git a/python/llm/example/transformers/transformers_int4/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md similarity index 97% rename from python/llm/example/transformers/transformers_int4/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md index 79e23fd1540..497e7e6c209 100644 --- a/python/llm/example/transformers/transformers_int4/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md @@ -21,6 +21,7 @@ You can use BigDL-LLM to run any Huggingface Transformer models with INT4 optimi | InternLM | [link](internlm) | | Whisper | [link](whisper) | | Qwen | [link](qwen) | +| Aquila | [link](aquila) | ## Recommended Requirements To run the examples, we recommend using Intel® Xeon® processors (server), or >= 12th Gen Intel® Core™ processor (client). diff --git a/python/llm/example/transformers/transformers_int4/aquila/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/aquila/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md diff --git a/python/llm/example/transformers/transformers_int4/aquila/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/aquila/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py diff --git a/python/llm/example/transformers/transformers_int4/baichuan/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/README.md diff --git a/python/llm/example/transformers/transformers_int4/baichuan/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py diff --git a/python/llm/example/transformers/transformers_int4/baichuan2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/README.md diff --git a/python/llm/example/transformers/transformers_int4/baichuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/chatglm/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md similarity index 100% rename from 
python/llm/example/transformers/transformers_int4/chatglm/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md diff --git a/python/llm/example/transformers/transformers_int4/chatglm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py diff --git a/python/llm/example/transformers/transformers_int4/chatglm2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/README.md diff --git a/python/llm/example/transformers/transformers_int4/chatglm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/chatglm2/streamchat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm2/streamchat.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py diff --git a/python/llm/example/transformers/transformers_int4/dolly_v1/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v1/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/README.md diff --git a/python/llm/example/transformers/transformers_int4/dolly_v1/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v1/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py diff --git a/python/llm/example/transformers/transformers_int4/dolly_v2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/README.md diff --git a/python/llm/example/transformers/transformers_int4/dolly_v2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/falcon/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/falcon/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/README.md diff --git a/python/llm/example/transformers/transformers_int4/falcon/falcon-40b-instruct/modelling_RW.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-40b-instruct/modelling_RW.py similarity index 100% rename from 
python/llm/example/transformers/transformers_int4/falcon/falcon-40b-instruct/modelling_RW.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-40b-instruct/modelling_RW.py diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/falcon/falcon-7b-instruct/modelling_RW.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py diff --git a/python/llm/example/transformers/transformers_int4/falcon/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/falcon/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py diff --git a/python/llm/example/transformers/transformers_int4/internlm/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/internlm/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/README.md diff --git a/python/llm/example/transformers/transformers_int4/internlm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/internlm/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py diff --git a/python/llm/example/transformers/transformers_int4/llama2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/llama2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/README.md diff --git a/python/llm/example/transformers/transformers_int4/llama2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/llama2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/moss/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/moss/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/README.md diff --git a/python/llm/example/transformers/transformers_int4/moss/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/moss/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py diff --git a/python/llm/example/transformers/transformers_int4/mpt/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/mpt/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/README.md diff --git a/python/llm/example/transformers/transformers_int4/mpt/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py similarity index 100% rename from 
python/llm/example/transformers/transformers_int4/mpt/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py diff --git a/python/llm/example/transformers/transformers_int4/phoenix/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/phoenix/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/README.md diff --git a/python/llm/example/transformers/transformers_int4/phoenix/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/phoenix/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py diff --git a/python/llm/example/transformers/transformers_int4/qwen/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/qwen/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md diff --git a/python/llm/example/transformers/transformers_int4/qwen/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/qwen/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py diff --git a/python/llm/example/transformers/transformers_int4/redpajama/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/redpajama/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/README.md diff --git a/python/llm/example/transformers/transformers_int4/redpajama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/redpajama/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py diff --git a/python/llm/example/transformers/transformers_int4/starcoder/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/starcoder/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/README.md diff --git a/python/llm/example/transformers/transformers_int4/starcoder/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/starcoder/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py diff --git a/python/llm/example/transformers/transformers_int4/vicuna/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/vicuna/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/README.md diff --git a/python/llm/example/transformers/transformers_int4/vicuna/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/vicuna/generate.py rename to 
python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py diff --git a/python/llm/example/transformers/transformers_int4/whisper/long-segment-recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/whisper/long-segment-recognize.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py diff --git a/python/llm/example/transformers/transformers_int4/whisper/readme.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/readme.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/whisper/readme.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/readme.md diff --git a/python/llm/example/transformers/transformers_int4/whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/whisper/recognize.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py diff --git a/python/llm/example/transformers/transformers_low_bit/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/README.md similarity index 100% rename from python/llm/example/transformers/transformers_low_bit/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/README.md diff --git a/python/llm/example/transformers/transformers_low_bit/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py similarity index 100% rename from python/llm/example/transformers/transformers_low_bit/transformers_low_bit_pipeline.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/README.md new file mode 100644 index 00000000000..e0cebde5b45 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/README.md @@ -0,0 +1,7 @@ +# Running Hugging Face Transformers model using BigDL-LLM on Intel CPU + +This folder contains examples of running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs): + +- [Model](Model): examples of running Hugging Face Transformers models (e.g., LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) +- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md new file mode 100644 index 00000000000..6a992c857a9 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md @@ -0,0 +1,43 @@ +# BigDL-LLM Transformers Low-Bit Inference Pipeline for Large Language Model + +In this example, we show a pipeline to apply BigDL-LLM low-bit optimizations (including INT8/INT5/INT4) to any Hugging Face Transformers model, and then run inference on the optimized low-bit model. 
+
+## Prepare Environment
+We suggest using conda to manage the environment:
+```bash
+conda create -n llm python=3.9
+conda activate llm
+
+pip install --pre --upgrade bigdl-llm[all]
+```
+
+## Run Example
+```bash
+python ./transformers_low_bit_pipeline.py --repo-id-or-model-path decapoda-research/llama-7b-hf --low-bit sym_int5 --save-path ./llama-7b-sym_int5
+```
+arguments info:
+- `--repo-id-or-model-path`: str value, the Hugging Face repo id of the large language model to be downloaded, or the path to a Hugging Face checkpoint folder; it is 'decapoda-research/llama-7b-hf' by default.
+- `--low-bit`: str value, options are sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8 (sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4, etc.). The relevant low-bit optimizations will be applied to the model.
+- `--save-path`: str value, the path to save the low-bit model. You can then load the low-bit model directly from this path.
+- `--load-path`: optional str value, the path to load the low-bit model.
+
+
+## Sample Output for Inference
+### 'decapoda-research/llama-7b-hf' Model
+```log
+Prompt: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
+Output: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. She wanted to be a princess, and she wanted to be a pirate. She wanted to be a superhero, and she wanted to be
+Model and tokenizer are saved to ./llama-7b-sym_int5
+```
+
+### Load low-bit model
+Command to run:
+```bash
+python ./transformers_low_bit_pipeline.py --load-path ./llama-7b-sym_int5
+```
+Output log:
+```log
+Prompt: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
+Output: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. She wanted to be a princess, and she wanted to be a pirate. She wanted to be a superhero, and she wanted to be
+```
+
diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py
new file mode 100644
index 00000000000..9cf9cffb878
--- /dev/null
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py
@@ -0,0 +1,56 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +import argparse +from bigdl.llm.transformers import AutoModelForCausalLM +from transformers import LlamaTokenizer, TextGenerationPipeline + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Transformer save_load example') + parser.add_argument('--repo-id-or-model-path', type=str, default="decapoda-research/llama-7b-hf", + help='The huggingface repo id for the large language model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--low-bit', type=str, default="sym_int4", + choices=['sym_int4', 'asym_int4', 'sym_int5', 'asym_int5', 'sym_int8'], + help='The quantization type the model will convert to.') + parser.add_argument('--save-path', type=str, default=None, + help='The path to save the low-bit model.') + parser.add_argument('--load-path', type=str, default=None, + help='The path to load the low-bit model.') + args = parser.parse_args() + model_path = args.repo_id_or_model_path + low_bit = args.low_bit + load_path = args.load_path + if load_path: + model = AutoModelForCausalLM.load_low_bit(load_path) + tokenizer = LlamaTokenizer.from_pretrained(load_path) + else: + # load_in_low_bit in bigdl.llm.transformers will convert + # the relevant layers in the model into corresponding int X format + model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) + tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) + + pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, max_new_tokens=32) + input_str = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" + output = pipeline(input_str)[0]["generated_text"] + print(f"Prompt: {input_str}") + print(f"Output: {output}") + + save_path = args.save_path + if save_path: + model.save_low_bit(save_path) + tokenizer.save_pretrained(save_path) + print(f"Model and tokenizer are saved to {save_path}") diff --git a/python/llm/example/langchain/README.md b/python/llm/example/CPU/LangChain/README.md similarity index 100% rename from python/llm/example/langchain/README.md rename to python/llm/example/CPU/LangChain/README.md diff --git a/python/llm/example/langchain/native_int4/docqa.py b/python/llm/example/CPU/LangChain/native_int4/docqa.py similarity index 100% rename from python/llm/example/langchain/native_int4/docqa.py rename to python/llm/example/CPU/LangChain/native_int4/docqa.py diff --git a/python/llm/example/langchain/native_int4/streamchat.py b/python/llm/example/CPU/LangChain/native_int4/streamchat.py similarity index 100% rename from python/llm/example/langchain/native_int4/streamchat.py rename to python/llm/example/CPU/LangChain/native_int4/streamchat.py diff --git a/python/llm/example/langchain/native_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py similarity index 100% rename from python/llm/example/langchain/native_int4/voiceassistant.py rename to python/llm/example/CPU/LangChain/native_int4/voiceassistant.py diff --git a/python/llm/example/langchain/transformers_int4/chat.py b/python/llm/example/CPU/LangChain/transformers_int4/chat.py similarity index 100% rename from python/llm/example/langchain/transformers_int4/chat.py rename to python/llm/example/CPU/LangChain/transformers_int4/chat.py diff --git a/python/llm/example/langchain/transformers_int4/docqa.py b/python/llm/example/CPU/LangChain/transformers_int4/docqa.py similarity index 100% rename from 
python/llm/example/langchain/transformers_int4/docqa.py rename to python/llm/example/CPU/LangChain/transformers_int4/docqa.py diff --git a/python/llm/example/langchain/transformers_int4/llm_math.py b/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py similarity index 100% rename from python/llm/example/langchain/transformers_int4/llm_math.py rename to python/llm/example/CPU/LangChain/transformers_int4/llm_math.py diff --git a/python/llm/example/langchain/transformers_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py similarity index 100% rename from python/llm/example/langchain/transformers_int4/voiceassistant.py rename to python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py diff --git a/python/llm/example/transformers/native_int4/README.md b/python/llm/example/CPU/Native-Models/README.md similarity index 100% rename from python/llm/example/transformers/native_int4/README.md rename to python/llm/example/CPU/Native-Models/README.md diff --git a/python/llm/example/transformers/native_int4/native_int4_pipeline.py b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py similarity index 100% rename from python/llm/example/transformers/native_int4/native_int4_pipeline.py rename to python/llm/example/CPU/Native-Models/native_int4_pipeline.py diff --git a/python/llm/example/pytorch-models/README.md b/python/llm/example/CPU/PyTorch-Models/Model/README.md similarity index 100% rename from python/llm/example/pytorch-models/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/README.md diff --git a/python/llm/example/pytorch-models/bark/README.md b/python/llm/example/CPU/PyTorch-Models/Model/bark/README.md similarity index 100% rename from python/llm/example/pytorch-models/bark/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/bark/README.md diff --git a/python/llm/example/pytorch-models/bark/synthesize_speech.py b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py similarity index 100% rename from python/llm/example/pytorch-models/bark/synthesize_speech.py rename to python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py diff --git a/python/llm/example/pytorch-models/bert/README.md b/python/llm/example/CPU/PyTorch-Models/Model/bert/README.md similarity index 100% rename from python/llm/example/pytorch-models/bert/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/bert/README.md diff --git a/python/llm/example/pytorch-models/bert/extract_feature.py b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py similarity index 100% rename from python/llm/example/pytorch-models/bert/extract_feature.py rename to python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py diff --git a/python/llm/example/pytorch-models/chatglm/README.md b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md similarity index 100% rename from python/llm/example/pytorch-models/chatglm/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md diff --git a/python/llm/example/pytorch-models/chatglm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py similarity index 100% rename from python/llm/example/pytorch-models/chatglm/generate.py rename to python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py diff --git a/python/llm/example/pytorch-models/llama2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md similarity index 100% rename from 
python/llm/example/pytorch-models/llama2/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md diff --git a/python/llm/example/pytorch-models/llama2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py similarity index 100% rename from python/llm/example/pytorch-models/llama2/generate.py rename to python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py diff --git a/python/llm/example/pytorch-models/openai-whisper/readme.md b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/readme.md similarity index 100% rename from python/llm/example/pytorch-models/openai-whisper/readme.md rename to python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/readme.md diff --git a/python/llm/example/pytorch-models/openai-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py similarity index 100% rename from python/llm/example/pytorch-models/openai-whisper/recognize.py rename to python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py diff --git a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/.keep b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/CPU/PyTorch-Models/README.md b/python/llm/example/CPU/PyTorch-Models/README.md new file mode 100644 index 00000000000..06860d4563d --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/README.md @@ -0,0 +1,7 @@ +# Running PyTorch model using BigDL-LLM on Intel CPU + +This folder contains examples of running any PyTorch model on BigDL-LLM (with "one-line code change"): + +- [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) 
+- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/CPU/PyTorch-Models/Save-Load/.keep b/python/llm/example/CPU/PyTorch-Models/Save-Load/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/CPU/README.md b/python/llm/example/CPU/README.md new file mode 100644 index 00000000000..1344cbb6dcb --- /dev/null +++ b/python/llm/example/CPU/README.md @@ -0,0 +1,18 @@ +# BigDL-LLM Examples on Intel CPU + +This folder contains examples of running BigDL-LLM on Intel CPU: + +- [HF-Transformers-AutoModels](HF-Transformers-AutoModels): running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs) +- [PyTorch-Models](PyTorch-Models): running any PyTorch model on BigDL-LLM (with "one-line code change") +- [Native-Models](Native-Models): converting & running LLM in `llama`/`chatglm`/`bloom`/`gptneox`/`starcoder` model family using native (cpp) implementation +- [LangChain](LangChain): running LangChain applications on BigDL-LLM + +## System Support +**Hardware**: +- Intel® Core™ processors +- Intel® Xeon® processors + +**Operating System**: +- Ubuntu 20.04 or later +- CentOS 7 or later +- Windows 10/11, with or without WSL diff --git a/python/llm/example/gpu/hf-transformers-models/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/README.md similarity index 98% rename from python/llm/example/gpu/hf-transformers-models/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/README.md index 0798745b952..a2164718533 100644 --- a/python/llm/example/gpu/hf-transformers-models/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/README.md @@ -21,6 +21,7 @@ You can use BigDL-LLM to run almost every Huggingface Transformer models with IN - Intel Arc™ A-Series Graphics - Intel Data Center GPU Flex Series +- Intel Data Center GPU Max Series ## Recommended Requirements To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. 
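For orientation, the AutoModel-style examples referenced in the READMEs above generally follow one pattern: load the checkpoint through `bigdl.llm.transformers.AutoModelForCausalLM` with INT4 optimizations, then run it like an ordinary Hugging Face model. A minimal sketch (the checkpoint id, prompt, and the optional XPU move are illustrative assumptions, not code copied from a specific example):

```python
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # illustrative checkpoint

# Load the model with BigDL-LLM INT4 optimizations applied to its linear layers
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# For the GPU examples the model is additionally moved to the Intel GPU,
# e.g. model = model.to('xpu'), after the driver and oneAPI setup described
# in the Requirements sections.

prompt = "Once upon a time, there existed a little girl who liked to have adventures."
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(inputs.input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```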
diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/chatglm2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chatglm2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/chatglm2/streamchat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chatglm2/streamchat.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py diff --git a/python/llm/example/gpu/hf-transformers-models/chinese-llama2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chinese-llama2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/chinese-llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chinese-llama2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/falcon/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/README.md diff --git 
a/python/llm/example/transformers/transformers_int4/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/falcon/falcon-7b-instruct/modelling_RW.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/falcon/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/gpt-j/readme.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/readme.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/gpt-j/readme.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/readme.md diff --git a/python/llm/example/gpu/hf-transformers-models/internlm/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/internlm/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/internlm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/internlm/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/llama2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/llama2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/llama2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/mpt/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/mpt/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/mpt/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/mpt/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/qwen/README.md 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/qwen/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/qwen/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/starcoder/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/starcoder/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/starcoder/readme.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/readme.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/starcoder/readme.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/readme.md diff --git a/python/llm/example/gpu/hf-transformers-models/voiceassistant/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/voiceassistant/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/whisper/readme.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/readme.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/whisper/readme.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/readme.md diff --git a/python/llm/example/gpu/hf-transformers-models/whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/whisper/recognize.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/.keep b/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/README.md new file mode 100644 index 00000000000..da1a13d6fb3 --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/README.md @@ -0,0 +1,7 @@ +# Running Hugging Face Transformers model using BigDL-LLM on Intel GPU + +This folder contains examples of running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs): + +- [Model](Model): examples of running Hugging Face Transformers models (e.g., LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) 
using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) +- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/.keep b/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/PyTorch-Models/Model/.keep b/python/llm/example/GPU/PyTorch-Models/Model/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/.keep b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/PyTorch-Models/README.md b/python/llm/example/GPU/PyTorch-Models/README.md new file mode 100644 index 00000000000..ce5cd50efdf --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/README.md @@ -0,0 +1,7 @@ +# Running PyTorch model using BigDL-LLM on Intel GPU + +This folder contains examples of running any PyTorch model on BigDL-LLM (with "one-line code change"): + +- [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) +- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/GPU/PyTorch-Models/Save-Load/.keep b/python/llm/example/GPU/PyTorch-Models/Save-Load/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/gpu/qlora_finetuning/README.md b/python/llm/example/GPU/QLoRA-FineTuning/README.md similarity index 100% rename from python/llm/example/gpu/qlora_finetuning/README.md rename to python/llm/example/GPU/QLoRA-FineTuning/README.md diff --git a/python/llm/example/gpu/qlora_finetuning/export_merged_model.py b/python/llm/example/GPU/QLoRA-FineTuning/export_merged_model.py similarity index 100% rename from python/llm/example/gpu/qlora_finetuning/export_merged_model.py rename to python/llm/example/GPU/QLoRA-FineTuning/export_merged_model.py diff --git a/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py b/python/llm/example/GPU/QLoRA-FineTuning/qlora_finetuning.py similarity index 100% rename from python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py rename to python/llm/example/GPU/QLoRA-FineTuning/qlora_finetuning.py diff --git a/python/llm/example/GPU/README.md b/python/llm/example/GPU/README.md new file mode 100644 index 00000000000..8cb7c7211b3 --- /dev/null +++ b/python/llm/example/GPU/README.md @@ -0,0 +1,26 @@ +# BigDL-LLM Examples on Intel GPU + +This folder contains examples of running BigDL-LLM on Intel GPU: + +- [HF-Transformers-AutoModels](HF-Transformers-AutoModels): running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs) +- [PyTorch-Models](PyTorch-Models): running any PyTorch model on BigDL-LLM (with "one-line code change") +- [QLoRA-FineTuning](QLoRA-FineTuning): running QLoRA finetuning on BigDL-LLM + + +## System Support +**Hardware**: +- Intel Arc™ A-Series Graphics +- Intel Data Center GPU Flex Series +- Intel Data Center GPU Max Series + +**Operating System**: +- Ubuntu 20.04 or later (Ubuntu 22.04 is preferred) + +## Requirements +To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. 
+ +Step 1, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. +> **Note**: IPEX 2.0.110+xpu requires Intel GPU Driver version is [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). + +Step 2, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). OneMKL and DPC++ compiler are needed, others are optional. +> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit's version >= 2023.2.0. diff --git a/python/llm/example/cpp-python/README.md b/python/llm/example/cpp-python/README.md deleted file mode 100644 index 60d51707a7a..00000000000 --- a/python/llm/example/cpp-python/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# BigDL-LLM INT4 Inference Using Llama-Cpp-Python Format API - -In this example, we show how to run inference on converted INT4 model using llama-cpp-python format API. - -> **Note**: Currently model family LLaMA, GPT-NeoX, BLOOM and StarCoder are supported. - -## Prepare Environment -We suggest using conda to manage environment: -```bash -conda create -n llm python=3.9 -conda activate llm - -pip install --pre --upgrade bigdl-llm[all] -``` - -## Convert Models using bigdl-llm -Follow the instructions in [Convert model](https://github.com/intel-analytics/BigDL/tree/main/python/llm#convert-model). - - -## Run the example -```bash -python ./int4_inference.py -m CONVERTED_MODEL_PATH -x MODEL_FAMILY -p PROMPT -t THREAD_NUM -``` -arguments info: -- `-m CONVERTED_MODEL_PATH`: **required**, path to the converted model -- `-x MODEL_FAMILY`: **required**, the model family of the model specified in `-m`, available options are `llama`, `gptneox`, `bloom` and `starcoder` -- `-p PROMPT`: question to ask. Default is `What is AI?`. -- `-t THREAD_NUM`: specify the number of threads to use for inference. Default is `2`. diff --git a/python/llm/example/cpp-python/int4_inference.py b/python/llm/example/cpp-python/int4_inference.py deleted file mode 100644 index b7edcb68924..00000000000 --- a/python/llm/example/cpp-python/int4_inference.py +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This would makes sure Python is aware there is more than one sub-package within bigdl, -# physically located elsewhere. -# Otherwise there would be module not found error in non-pip's setting as Python would -# only search the first bigdl package and end up finding only one sub-package. 
- -import argparse - -def main(args): - model_family = args.model_family - model_path = args.model_path - prompt = args.prompt - n_threads = args.thread_num - - if model_family == "llama": - from bigdl.llm.models import Llama - modelclass = Llama - if model_family == "bloom": - from bigdl.llm.models import Bloom - modelclass = Bloom - if model_family == "gptneox": - from bigdl.llm.models import Gptneox - modelclass = Gptneox - if model_family == "starcoder": - from bigdl.llm.models import Starcoder - modelclass = Starcoder - - model = modelclass(model_path, n_threads=n_threads) - response=model(prompt) - print(response) - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Llama-CPP-Python style API Simple Example') - parser.add_argument('-x','--model-family', type=str, required=True, - choices=["llama", "bloom", "gptneox", "starcoder"], - help='the model family') - parser.add_argument('-m','--model-path', type=str, required=True, - help='the path to the converted llm model') - parser.add_argument('-p', '--prompt', type=str, default='What is AI?', - help='qustion you want to ask.') - parser.add_argument('-t','--thread-num', type=int, default=2, - help='number of threads to use for inference') - args = parser.parse_args() - - main(args) diff --git a/python/llm/example/gpu/README.md b/python/llm/example/gpu/README.md deleted file mode 100644 index 1abff7e56cd..00000000000 --- a/python/llm/example/gpu/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# BigDL-LLM INT4 Optimization for Large Language Model on Intel GPUs -You can use BigDL-LLM to run almost every Huggingface Transformer models with INT4 optimizations on your laptops with Intel GPUs. Moreover, you can also use `optimize_model` API to accelerate general PyTorch models on Intel GPUs. - -## Verified models -| Model | Example | -|------------|----------------------------------------------------------| -| Baichuan | [link](hf-transformers-models/baichuan) | -| Baichuan2 | [link](hf-transformers-models/baichuan2) | -| ChatGLM2 | [link](hf-transformers-models/chatglm2) | -| Chinese Llama2 | [link](hf-transformers-models/chinese-llama2)| -| Falcon | [link](hf-transformers-models/falcon) | -| GPT-J | [link](hf-transformers-models/gpt-j) | -| InternLM | [link](hf-transformers-models/internlm) | -| LLaMA 2 | [link](hf-transformers-models/llama2) | -| MPT | [link](hf-transformers-models/mpt) | -| Qwen | [link](hf-transformers-models/qwen) | -| StarCoder | [link](hf-transformers-models/starcoder) | -| Whisper | [link](hf-transformers-models/whisper) | - -## Verified Hardware Platforms - -- Intel Arc™ A-Series Graphics -- Intel Data Center GPU Flex Series - -## Recommended Requirements -To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. - -Step 1, only Linux system is supported now, Ubuntu 22.04 is prefered. - -Step 2, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. -> **Note**: IPEX 2.0.110+xpu requires Intel GPU Driver version is [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). - -Step 3, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). OneMKL and DPC++ compiler are needed, others are optional. -> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit's version >= 2023.2.0. 
- -## Best Known Configuration on Linux -For better performance, it is recommended to set environment variables on Linux: -```bash -export USE_XETLA=OFF -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -``` diff --git a/python/llm/example/gpu/pytorch-models/README.md b/python/llm/example/gpu/pytorch-models/README.md deleted file mode 100644 index 6c958e7a968..00000000000 --- a/python/llm/example/gpu/pytorch-models/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# BigDL-LLM INT4 Optimization for Large Language Model on Intel GPUs -You can use `optimize_model` API to accelerate general PyTorch models on Intel servers and PCs. This directory contains example scripts to help you quickly get started using BigDL-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. - -## Verified Hardware Platforms - -- Intel Arc™ A-Series Graphics -- Intel Data Center GPU Flex Series - -## Recommended Requirements -To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. - -Step 1, only Linux system is supported now, Ubuntu 22.04 is prefered. - -Step 2, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. -> **Note**: IPEX 2.0.110+xpu requires Intel GPU Driver version is [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). - -Step 3, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). OneMKL and DPC++ compiler are needed, others are optional. -> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit's version >= 2023.2.0. - -## Best Known Configuration on Linux -For better performance, it is recommended to set environment variables on Linux: -```bash -export USE_XETLA=OFF -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -``` diff --git a/python/llm/example/transformers/transformers_int4/GPU/README.md b/python/llm/example/transformers/transformers_int4/GPU/README.md deleted file mode 100644 index f12e7824f0d..00000000000 --- a/python/llm/example/transformers/transformers_int4/GPU/README.md +++ /dev/null @@ -1 +0,0 @@ -### The GPU examples for `bigdl-llm` have been moved to [here](../../../gpu). diff --git a/python/llm/portable-executable/.gitignore b/python/llm/portable-executable/.gitignore deleted file mode 100644 index 23c0161c874..00000000000 --- a/python/llm/portable-executable/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -python-embed -portable-executable.zip \ No newline at end of file diff --git a/python/llm/portable-executable/README.md b/python/llm/portable-executable/README.md deleted file mode 100644 index 0f1df88f1f4..00000000000 --- a/python/llm/portable-executable/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# BigDL-LLM Portable Executable For Windows: User Guide - -This portable executable includes everything you need to run LLM (except models). Please refer to How to use section to get started. - -## 13B model running on an Intel 11-Gen Core PC (real-time screen capture) - -
- -## Verified Models - -- ChatGLM2-6b -- Baichuan-13B-Chat -- Baichuan2-7B-Chat -- internlm-chat-7b-8k -- Llama-2-7b-chat-hf - -## How to use - -1. Download the model to your computer. Please ensure there is a file named `config.json` in the model folder, otherwise the script won't work. - - ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step1.png) - -2. Run `chat.bat` in Terminal and input the path of the model (e.g. `path\to\model`, note that there's no slash at the end of the path). - - ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step2.png) - -3. Press Enter and wait until model finishes loading. Then enjoy chatting with the model! -4. If you want to stop chatting, just input `stop` and the model will stop running. - - ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step34.png) diff --git a/python/llm/portable-executable/setup.md b/python/llm/portable-executable/setup.md deleted file mode 100644 index 22520c64075..00000000000 --- a/python/llm/portable-executable/setup.md +++ /dev/null @@ -1,5 +0,0 @@ -# BigDL-LLM Portable Executable Setup Script For Windows - -# How to use - -Just simply run `setup.bat` and it will download and install all dependency and generate a zip file for user to use. diff --git a/python/llm/portable-zip/.gitignore b/python/llm/portable-zip/.gitignore new file mode 100644 index 00000000000..fa79eccd951 --- /dev/null +++ b/python/llm/portable-zip/.gitignore @@ -0,0 +1,2 @@ +python-embed +bigdl-llm.zip \ No newline at end of file diff --git a/python/llm/portable-zip/README.md b/python/llm/portable-zip/README.md new file mode 100644 index 00000000000..a8202f5567a --- /dev/null +++ b/python/llm/portable-zip/README.md @@ -0,0 +1,37 @@ +# BigDL-LLM Portable Zip For Windows: User Guide + +## Introduction + +This portable zip includes everything you need to run an LLM with BigDL-LLM optimizations (except models) . Please refer to [How to use](#how-to-use) section to get started. + +### 13B model running on an Intel 11-Gen Core PC (real-time screen capture) + +
+ +### Verified Models + +- ChatGLM2-6b +- Baichuan-13B-Chat +- Baichuan2-7B-Chat +- internlm-chat-7b-8k +- Llama-2-7b-chat-hf + +## How to use + +1. Download the zip from link [here](). +2. (Optional) You could also build the zip on your own. Run `setup.bat` and it will generate the zip file. +3. Unzip `bigdl-llm.zip`. +4. Download the model to your computer. Please ensure there is a file named `config.json` in the model folder, otherwise the script won't work. + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step1.png) + +5. Go into the unzipped folder and double click `chat.bat`. Input the path of the model (e.g. `path\to\model`, note that there's no slash at the end of the path). Press Enter and wait until model finishes loading. Then enjoy chatting with the model! + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step2.png) + +6. If you want to stop chatting, just input `stop` and the model will stop running. + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step34.png) diff --git a/python/llm/portable-executable/chat.bat b/python/llm/portable-zip/chat.bat similarity index 61% rename from python/llm/portable-executable/chat.bat rename to python/llm/portable-zip/chat.bat index b02c961536f..832eb13d588 100644 --- a/python/llm/portable-executable/chat.bat +++ b/python/llm/portable-zip/chat.bat @@ -5,4 +5,6 @@ set PYTHONUNBUFFERED=1 set /p modelpath="Please enter the model path: " -.\python-embed\python.exe .\chat.py --model-path="%modelpath%" \ No newline at end of file +.\python-embed\python.exe .\chat.py --model-path="%modelpath%" + +pause \ No newline at end of file diff --git a/python/llm/portable-executable/chat.py b/python/llm/portable-zip/chat.py similarity index 100% rename from python/llm/portable-executable/chat.py rename to python/llm/portable-zip/chat.py diff --git a/python/llm/portable-executable/setup.bat b/python/llm/portable-zip/setup.bat similarity index 92% rename from python/llm/portable-executable/setup.bat rename to python/llm/portable-zip/setup.bat index de8ad28c273..199902143ca 100644 --- a/python/llm/portable-executable/setup.bat +++ b/python/llm/portable-zip/setup.bat @@ -20,4 +20,4 @@ cd .. %python-embed% -m pip install bigdl-llm[all] transformers_stream_generator tiktoken einops colorama :: compress the python and scripts -powershell -Command "Compress-Archive -Path '.\python-embed', '.\chat.bat', '.\chat.py', '.\README.md' -DestinationPath .\portable-executable.zip" +powershell -Command "Compress-Archive -Path '.\python-embed', '.\chat.bat', '.\chat.py', '.\README.md' -DestinationPath .\bigdl-llm.zip" diff --git a/python/llm/portable-zip/setup.md b/python/llm/portable-zip/setup.md new file mode 100644 index 00000000000..5810a55981a --- /dev/null +++ b/python/llm/portable-zip/setup.md @@ -0,0 +1,5 @@ +# BigDL-LLM Portable Zip Setup Script For Windows + +# How to use + +Just simply run `setup.bat` and it will download and install all dependency and generate `bigdl-llm.zip` for user to use. 
diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py index 7023a4bdb6a..579ee913ce6 100644 --- a/python/llm/src/bigdl/llm/ggml/quantize.py +++ b/python/llm/src/bigdl/llm/ggml/quantize.py @@ -31,7 +31,8 @@ "asym_int5": 7, # q5_1 in ggml "sym_int8": 8, # q8_0 in ggml "nf4": 10, - "nf3": 11} + "nf3": 11, + "fp16": 12} _llama_quantize_type = {"q4_0": 2, "q4_1": 3, @@ -71,7 +72,7 @@ def quantize(input_path: str, output_path: str, :param dtype: Quantization method which differs in the resulting model disk size and inference speed. Defalut to `q4_0`. Difference model family may support different types, now the supported list is: - llama : "q4_0", "q4_1", "q4_2" + llama : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" bloom : "q4_0", "q4_1" gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" starcoder : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" diff --git a/python/llm/src/bigdl/llm/optimize.py b/python/llm/src/bigdl/llm/optimize.py index dfb0c344764..3169660c067 100644 --- a/python/llm/src/bigdl/llm/optimize.py +++ b/python/llm/src/bigdl/llm/optimize.py @@ -24,6 +24,12 @@ from accelerate.utils import set_module_tensor_to_device from bigdl.llm.ggml.quantize import ggml_tensor_qtype from bigdl.llm.utils.common import invalidInputError +from bigdl.llm.transformers.utils import extract_local_archive_file, get_local_shard_files +import transformers +from transformers import PreTrainedModel +from .utils.common import MuteHFLogger +from .utils.lazy_load_torch import LazyLoadTensors +from contextlib import ExitStack, contextmanager # Simulate the Hugging Face format @@ -37,7 +43,14 @@ def _save_low_bit(self, save_dir, *args, **kwargs): f" load_in_4bit or load_in_low_bit parameter to load a 4-bit model first.") os.makedirs(save_dir, exist_ok=True) model_path = os.path.join(save_dir, PYTORCH_MODEL_NAME) - torch.save(self.state_dict(), model_path, *args, **kwargs) + if isinstance(self, PreTrainedModel): + # We borrowed this method to adapt to Transformer model cases + # as much as possible, and later we may merge these two situations + self.save_pretrained(save_dir) + else: + # TODO: For the lowbit model still larger than 8GB, + # save it into shards. + torch.save(self.state_dict(), model_path, *args, **kwargs) with open(os.path.join(save_dir, CONFIG_NAME), "w") as json_file: json.dump(self._bigdl_config, json_file) @@ -49,14 +62,44 @@ class DisableTorchAllocTensor(): def __init__(self) -> None: self._old_torch_load_state_dict = Module.load_state_dict self._old_torch_to_device = Module.to + self._old_torch_load_from_state_dict = Module._load_from_state_dict + # Chatglm2 init weights manually, + # and `skip_init` init on `cpu` by default + self._old_skip_init = torch.nn.utils.skip_init def __enter__(self): Module.load_state_dict = lambda *args, **kwargs: _IncompatibleKeys([], []) + Module._load_from_state_dict = lambda *args, **kwargs: None Module.to = lambda self, *args, **kwargs: self + def skip_init_on_meta(module_cls, *args, **kwargs): + kwargs['device'] = 'meta' + return self._old_skip_init(module_cls, *args, **kwargs) + torch.nn.utils.skip_init = skip_init_on_meta + def __exit__(self, exc_type, exc_value, traceback): Module.load_state_dict = self._old_torch_load_state_dict + Module._load_from_state_dict = self._old_torch_load_from_state_dict Module.to = self._old_torch_to_device + torch.nn.utils.skip_init = self._old_skip_init + + +class ContextManagers: + """ + Wrapper for `contextlib.ExitStack` which enters a collection of context managers. 
+ Adaptation of `ContextManagers` in the `fastcore` library. + """ + + def __init__(self, context_managers): + self.context_managers = context_managers + self.stack = ExitStack() + + def __enter__(self): + for context_manager in self.context_managers: + self.stack.enter_context(context_manager) + + def __exit__(self, *args, **kwargs): + self.stack.__exit__(*args, **kwargs) def low_bit_sanity_check(model_path): @@ -76,31 +119,49 @@ def low_bit_sanity_check(model_path): return low_bit -def load_low_bit(model_or_creator, model_path, **kwargs): - is_creator = not isinstance(model_or_creator, torch.nn.Module) \ - and callable(model_or_creator) - low_bit = low_bit_sanity_check(model_path) +@contextmanager +def low_memory_init(): + init_contexts = [] + init_contexts.extend([init_empty_weights(), DisableTorchAllocTensor()]) + # Load everything except Tensors' parameters + init_contexts.append(LazyLoadTensors()) + # As we have muted the `torch.load`, this will trigger a key missing warning in hf + # but this matters not for we will load again later. + init_contexts.append(MuteHFLogger(logger=transformers.modeling_utils.logger)) + with ContextManagers(init_contexts): + yield + +def load_low_bit(model, model_path): + low_bit = low_bit_sanity_check(model_path) + invalidInputError(isinstance(model, torch.nn.Module), + "model should be a instance of " + f"`torch.nn.Module`, but got {type(model)} at last.") if low_bit: - # a creator - if is_creator: - with init_empty_weights(), DisableTorchAllocTensor(): - model = model_or_creator(**kwargs) - else: - model = model_or_creator invalidInputError(isinstance(model, torch.nn.Module), - "model_or_creator should be a instance of " - "`torch.nn.Module`or a method that returns " - f"an instance of `torch.nn.Module`, but got {type(model)} at last.") + "model should be an instance of `torch.nn.Module`, " + f"but got {type(model)} at last.") + invalidInputError(model.device.type in ('cpu', 'meta'), + "Expect model on device `cpu` or `meta`, " + f"but got device type {model.device.type}") qtype = ggml_tensor_qtype[low_bit] model = ggml_convert_low_bit(model, qtype=qtype, convert_shape_only=True) - state_dict = torch.load(os.path.join(model_path, PYTORCH_MODEL_NAME)) - if is_creator: + resolved_archive_file, is_sharded = extract_local_archive_file(model_path, subfolder="") + if is_sharded: + # For now only shards transformers models + # can run in this branch. 
+ resolved_archive_file, _ = \ + get_local_shard_files(model_path, + resolved_archive_file, + subfolder="") + else: + resolved_archive_file = [os.path.join(model_path, PYTORCH_MODEL_NAME)] + + for model_file in resolved_archive_file: + state_dict = torch.load(model_file) for param_name, param in state_dict.items(): set_module_tensor_to_device(model, param_name, "cpu", param) - else: - model.load_state_dict(state_dict=state_dict) return model @@ -118,6 +179,12 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True): invalidInputError(low_bit in ggml_tensor_qtype, f"Unknown load_in_low_bit value: {low_bit}, expected:" f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.") + invalidInputError(isinstance(model, torch.nn.Module), + "model should be an instance of " + f"`torch.nn.Module`, but got {type(model)} at last.") + invalidInputError(model.device.type == 'cpu', + "Expect model on device `cpu`, " + f"but got device type {model.device.type}") qtype = ggml_tensor_qtype[low_bit] model = ggml_convert_low_bit(model, qtype=qtype, optimize_model=optimize_llm) # add save_low_bit to pretrained model dynamically diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py index b0bc581d9ec..e0c762335ff 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/bigdl/llm/transformers/convert.py @@ -41,12 +41,13 @@ import warnings import transformers import importlib +from bigdl.llm.ggml.quantize import ggml_tensor_qtype from .utils import logger def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, current_key_name=None, convert_shape_only=False): - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params + from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, FP16Linear has_been_replaced = False for name, module in model.named_children(): @@ -57,33 +58,55 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, # Check if the current key is not in the `modules_to_not_convert` if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): with init_empty_weights(): - new_linear = LowBitLinear( - module.in_features, - module.out_features, - qtype, - module.bias is not None, - ) + new_linear = None + if qtype != ggml_tensor_qtype["fp16"]: + new_linear = LowBitLinear( + module.in_features, + module.out_features, + qtype, + module.bias is not None, + ) + + device_type = module.weight.data.device.type + # Copy the weights + paramsLowBit = FP4Params(data=module.weight.data, + requires_grad=False, + quantized=False, + _shape=None, + convert_shape_only=convert_shape_only, + qtype=qtype).to(device_type) + new_linear._parameters['weight'] = paramsLowBit + else: + # only support two size now + # may generalize to other sizes + if module.in_features in [4096, 11008]: + # esimd fp16 path + new_linear = FP16Linear( + module.in_features, + module.out_features, + qtype, + module.bias is not None, + ) + device_type = module.weight.data.device.type - device_type = module.weight.data.device.type - # Copy the weights - paramsLowBit = FP4Params(data=module.weight.data, - requires_grad=False, - quantized=False, - _shape=None, - convert_shape_only=convert_shape_only, - qtype=qtype).to(device_type) - new_linear._parameters['weight'] = paramsLowBit + # convert here + m, n = module.weight.data.shape + trans_weight = module.weight.data.reshape(m//16, 16, n) + trans_weight = trans_weight.transpose(1, 2).contiguous() + 
new_linear._parameters['weight'] = nn.Parameter(trans_weight) - if module.bias is not None: - new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ - .to(device_type) + # fp16 may generalize to other sizes later + if new_linear is not None: + if module.bias is not None: + new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ + .to(device_type) - model._modules[name] = new_linear - has_been_replaced = True - # Force requires grad to False to avoid unexpected errors - model._modules[name].requires_grad_(False) + model._modules[name] = new_linear + has_been_replaced = True + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) - module.weight = None + module.weight = None # Remove the last key for recursion if len(list(module.children())) > 0: diff --git a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py index 931a118f47c..0e3b7dbad46 100644 --- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py +++ b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py @@ -288,10 +288,10 @@ def ggml_matmul_src1_x_src0_t(src0: torch.Tensor, class MatMulLowBit(torch.autograd.Function): @staticmethod - def forward(ctx, A, weight): + def forward(ctx, A, weight, input_seq_size): ctx.is_empty = False import linear_q4_0 - result = linear_q4_0.forward_new(A, weight.data, weight.qtype) + result = linear_q4_0.forward_new(A, weight.data, weight.qtype, input_seq_size) if any(ctx.needs_input_grad[:2]): ctx.tensors = (A, weight) else: @@ -304,14 +304,14 @@ def backward(ctx, grad_output): if ctx.is_empty: bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None - req_gradA, _ = ctx.needs_input_grad + req_gradA, _, _ = ctx.needs_input_grad A, weight = ctx.tensors grad_A, grad_weight = None, None if req_gradA: dequant_weight = linear_q4_0.dequant(A, weight.data, weight.qtype) grad_A = torch.matmul(grad_output, dequant_weight.reshape(weight._shape)) - return grad_A, grad_weight + return grad_A, grad_weight, None class LowBitLinear(nn.Linear): @@ -353,10 +353,12 @@ def forward(self, x: torch.Tensor): # disable the conversion when training if self.conver_to_half and x_2d.shape[0] > 1 and x_2d.dtype == torch.float32: x_2d = x_2d.half() + input_seq_size = x_shape[1] if self.training and x_2d.requires_grad: - result = MatMulLowBit.apply(x_2d, self.weight) + result = MatMulLowBit.apply(x_2d, self.weight, input_seq_size) else: - result = linear_q4_0.forward_new(x_2d, self.weight.data, self.weight.qtype) + result = linear_q4_0.forward_new(x_2d, self.weight.data, self.weight.qtype, + input_seq_size) new_shape = x_shape[:-1] + (self.out_len,) result = result.view(new_shape) if self.bias is not None: @@ -378,3 +380,53 @@ def forward(self, x: torch.Tensor): result += self.bias return result.to(x.dtype) + + +class FP16Linear(nn.Linear): + def __init__(self, input_features, output_features, qtype, bias=True, + conver_to_half=True): + super().__init__(input_features, output_features, bias) + self.in_len = input_features + self.out_len = output_features + self.weight_shape = (self.out_len, self.in_len) + self.weight_length = self.out_len * self.in_len + self.qtype = qtype + self.conver_to_half = conver_to_half + + def forward(self, x: torch.Tensor): + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + x_shape = x.shape + x_2d = x.view(-1, x_shape[-1]) + 
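+        # Note: self.weight was pre-packed in convert.py into a blocked fp16 layout of shape (out_len//16, in_len, 16) expected by the ESIMD kernel; the batch / first-token branch below unpacks it back to (out_len, in_len) before calling F.linear.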
+ x0 = self.weight.data + # only work for GPU + invalidInputError(x0.device.type == "xpu", + "FP16 only works for GPU") + try: + import intel_extension_for_pytorch + import linear_fp16_esimd + except ModuleNotFoundError: + invalidInputError(False, + "Please `pip install bigdl_core_xe` first.") + + if x_2d.is_contiguous() is False: + x_2d = x_2d.contiguous() + + if x_2d.shape[0] > 1: + # first token or batch size > 1, re-convert weight + original_weight = self.weight.data.transpose(1, 2) + original_weight = original_weight.reshape(self.out_len, self.in_len) + result = F.linear(x_2d, original_weight.contiguous()) + del original_weight + else: + # rest token, use esimd optimization + result = linear_fp16_esimd.forward(x_2d, self.weight.data) + + new_shape = x_shape[:-1] + (self.out_len,) + result = result.view(new_shape) + if self.bias is not None: + result += self.bias + + return result.to(x.dtype) diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py index 8f926517e16..51b2f656f2b 100644 --- a/python/llm/src/bigdl/llm/transformers/model.py +++ b/python/llm/src/bigdl/llm/transformers/model.py @@ -60,7 +60,7 @@ def from_pretrained(cls, :param load_in_4bit: boolean value, True means load linear's weight to symmetric int 4. Default to be False. :param load_in_low_bit: str value, options are sym_int4, asym_int4, sym_int5, asym_int5 - or sym_int8. sym_int4 means symmetric int 4, asym_int4 means + , sym_int8 or fp16. sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4, etc. Relevant low bit optimizations will be applied to the model. :param optimize_model: boolean value, Whether to further optimize the low_bit llm model. @@ -104,8 +104,9 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs): from .convert import ggml_convert_low_bit invalidInputError(q_k in ggml_tensor_qtype, f"Unknown load_in_low_bit value: {q_k}, expected:" - f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.") + f" sym_int4, asym_int4, sym_int5, asym_int5, sym_int8 or fp16.") qtype = ggml_tensor_qtype[q_k] + # In case it needs a second try, # `from_pretrained`` may pop items out in dict # and lead to args missing. diff --git a/python/llm/src/bigdl/llm/transformers/utils.py b/python/llm/src/bigdl/llm/transformers/utils.py index c0fa9d2a9a6..499765e102f 100644 --- a/python/llm/src/bigdl/llm/transformers/utils.py +++ b/python/llm/src/bigdl/llm/transformers/utils.py @@ -55,7 +55,7 @@ WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json" -def extract_local_archive_file(pretrained_model_name_or_path, subfolder, variant): +def extract_local_archive_file(pretrained_model_name_or_path, subfolder, variant=None): pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)) diff --git a/python/llm/src/bigdl/llm/utils/lazy_load_torch.py b/python/llm/src/bigdl/llm/utils/lazy_load_torch.py new file mode 100644 index 00000000000..4d205b684dd --- /dev/null +++ b/python/llm/src/bigdl/llm/utils/lazy_load_torch.py @@ -0,0 +1,193 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# =========================================================================== +# +# This file is adapted from +# https://github.com/ggerganov/llama.cpp/blob/master/convert.py#L516 +# +# MIT License +# +# Copyright (c) 2023 Georgi Gerganov +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +import torch +from torch.serialization import StorageType +import pickle +import zipfile +import io +from typing import Dict, IO, Any, Callable +from dataclasses import dataclass +from .common import invalidInputError + + +item_size = {torch.bfloat16: 2, + torch.float16: 2, + torch.int: 4, + torch.float: 4, + torch.float32: 4, + torch.int8: 1} + + +@dataclass +class LazyStorage: + load: Callable[[int, int], torch.Tensor] + kind: StorageType + description: str + + +@dataclass +class LazyTensor: + _load: Callable[[], torch.Tensor] + shape: list[int] + data_type: torch.dtype + description: str + + def load(self) -> torch.Tensor: + ret = self._load() + return ret + + def to(self, data_type): + # self.validate_conversion_to(data_type) + + def load() -> torch.Tensor: + print(f"to {data_type}") + return self.load().to(data_type) + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') + + +def _load(pickle_fp, map_location, picklemoudle, pickle_file='data.pkl', zip_file=None): + + load_module_mapping: Dict[str, str] = { + 'torch.tensor': 'torch._tensor' + } + + class LazyUnpickler(picklemoudle.Unpickler): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): + super().__init__(fp) + self.data_base_path = data_base_path + self.zip_file = zip_file + + def persistent_load(self, pid): + data_type = pid[1].dtype + filename_stem = pid[2] + filename = f'{self.data_base_path}/{filename_stem}' + info = self.zip_file.getinfo(filename) + + def load(offset: int, elm_count: int): + dtype = data_type + fp = self.zip_file.open(info) + fp.seek(offset * item_size[dtype]) + size = elm_count * item_size[dtype] + data = fp.read(size) + return torch.frombuffer(bytearray(data), dtype=dtype) + description = f'storage data_type={data_type} ' \ + 
'path-in-zip={filename} path={self.zip_file.filename}' + return LazyStorage(load=load, kind=pid[1], description=description) + + @staticmethod + def lazy_rebuild_tensor_v2(storage: Any, + storage_offset: Any, + size: Any, + stride: Any, + requires_grad: Any, + backward_hooks: Any, + metadata: Any = None) -> LazyTensor: + invalidInputError(isinstance(storage, LazyStorage), + "storage should be an instance of class `LazyStorage`, " + f"but get {type(storage)}.") + + def load() -> torch.Tensor: + elm_count = stride[0] * size[0] + return storage.load(storage_offset, elm_count).reshape(size) + description = f'pickled storage_offset={storage_offset} in {storage.description}' + return LazyTensor(load, list(size), storage.kind.dtype, description) + + @staticmethod + def rebuild_from_type_v2(func, new_type, args, state): + return func(*args) + + CLASSES: dict[tuple[str, str], Any] = { + ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), + ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), + ('torch', 'Tensor'): LazyTensor, + } + + def find_class(self, mod_name, name): + if (mod_name, name) in self.CLASSES: + return self.CLASSES[(mod_name, name)] + if type(name) is str and 'Storage' in name: + try: + return StorageType(name) + except KeyError: + pass + mod_name = load_module_mapping.get(mod_name, mod_name) + return super().find_class(mod_name, name) + + unpickler = LazyUnpickler(pickle_fp, + data_base_path=pickle_file, + zip_file=zip_file) + result = unpickler.load() + + return result + + +# This can only be used on huggingface transformers loaded from a zip file. +def lazyload( + f, + *args, + **kwargs +): + if isinstance(f, io.BufferedIOBase): + fp = f + else: + fp = open(f, 'rb') + zf = zipfile.ZipFile(fp) + pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] + invalidInputError(len(pickle_paths) == 1, + "There should be only one pickle_paths found, " + f"but get {pickle_paths}. 
") + pickle_fp = zf.open(pickle_paths[0], 'r') + state_dict = _load(pickle_fp, None, pickle, pickle_file=pickle_paths[0][:-4], zip_file=zf) + return state_dict + + +class LazyLoadTensors: + def __init__(self): + self.torch_load = torch.load + + def __enter__(self): + torch.load = lazyload + + def __exit__(self, exc_type, exc_value, traceback): + torch.load = self.torch_load diff --git a/python/llm/test/convert/test_convert_model.py b/python/llm/test/convert/test_convert_model.py index e8cc30792a0..1a0495d6d76 100644 --- a/python/llm/test/convert/test_convert_model.py +++ b/python/llm/test/convert/test_convert_model.py @@ -22,6 +22,7 @@ from bigdl.llm import llm_convert from bigdl.llm.transformers import AutoModelForCausalLM +from bigdl.llm.optimize import optimize_model, load_low_bit, low_memory_init llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH') @@ -87,5 +88,22 @@ def test_transformer_convert_llama_save_load(self): newModel = AutoModelForCausalLM.load_low_bit(tempdir) assert newModel is not None + def test_optimize_transformers_llama(self): + from transformers import AutoModelForCausalLM as AutoCLM + with tempfile.TemporaryDirectory(dir=output_dir) as tempdir: + model = AutoCLM.from_pretrained(llama_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True) + model = optimize_model(model) + model.save_low_bit(tempdir) + with low_memory_init(): + new_model = AutoCLM.from_pretrained(tempdir, + torch_dtype="auto", + trust_remote_code=True) + new_model = load_low_bit(new_model, + model_path=tempdir) + assert new_model is not None + if __name__ == '__main__': pytest.main([__file__])