Merge branch 'intel-analytics:main' into cicd_release_zip
JinBridger authored Oct 9, 2023
2 parents 49da199 + 05ffcda commit 9d61166
Showing 146 changed files with 943 additions and 410 deletions.
192 changes: 192 additions & 0 deletions .github/actions/ppml/ppml-occlum-EDMM-exampletests-action/action.yml
@@ -0,0 +1,192 @@
name: 'Run PPML Occlum EDMM ExampleTests'
description: 'Run PPML Occlum EDMM ExampleTests'
inputs:
  image:
    description: 'image'
    required: true
    default: '10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum'
  image-tag:
    description: 'image tag'
    required: true
    default: '2.4.0-SNAPSHOT-EDMM6'
runs:
  using: "composite"
  steps:
    - name: Run tests
      shell: bash
      env:
        DEFAULT_IMAGE: ${{ inputs.image }}:${{ inputs.image-tag }}
      run: |
        whoami
        # icx-6's kernel supports EDMM
        export LOCAL_IP=172.168.0.210
        export CPUSET="6-10"
        export CONTAINER_NAME="spark-occlum-edmm-jenkins"
        export DATA_PATH=/home/icx/glorysdj/data
        export KEYS_PATH=/home/icx/glorysdj/keys
        export SECURE_PASSWORD_PATH=/home/icx/glorysdj/password
        export SGX_MEM_SIZE=30GB
        export SGX_KERNEL_HEAP=2GB
        export IMAGE=${{ env.DEFAULT_IMAGE }}
        docker pull $IMAGE
        docker pull intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum:2.4.0-SNAPSHOT-EDMM
        docker stop $CONTAINER_NAME
        docker rm -f $CONTAINER_NAME
        docker run -itd \
            --net=host \
            --cpuset-cpus=$CPUSET \
            --oom-kill-disable \
            --device=/dev/sgx/enclave \
            --device=/dev/sgx/provision \
            -v /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket \
            -v $DATA_PATH:/opt/occlum_spark/data \
            -v $KEYS_PATH:/opt/keys \
            --name=$CONTAINER_NAME \
            -e LOCAL_IP=$LOCAL_IP \
            -e SGX_MEM_SIZE=$SGX_MEM_SIZE \
            -e SGX_KERNEL_HEAP=$SGX_KERNEL_HEAP \
            $IMAGE \
            bash -c "tail -f /dev/null"
        status_1_spark_pi=1
        status_2_bigdl_lenet_mnist=1
        status_3_bigdl_resnet_cifar10=1
        status_4_spark_tpch=1
        status_5_spark_ut=0
        status_6_spark_xgboost=1
        status_7_spark_gbt=1
        status_8_pyspark_sklearn=1
        status_9_pyspark_sql=1
        status_10_pyspark_tpch=1
        status_11_spark_lgbm=1
        if [ $status_1_spark_pi -ne 0 ]; then
          echo "################## start spark pi"
          echo "example.1 spark pi"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            bash run_spark_on_occlum_glibc.sh pi | tee test-spark-pi-sgx.log && \
            cat test-spark-pi-sgx.log | egrep 'Pi is roughly 3'"
          status_1_spark_pi=$(echo $?)
        fi
        if [ $status_2_bigdl_lenet_mnist -ne 0 ]; then
          echo "################## start bigdl lenet mnist"
          echo "example.2 bigdl lenet mnist"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            sed -i 's# run_spark_lenet_mnist# run_spark_lenet_mnist -b 4 -e 1#g' run_spark_on_occlum_glibc.sh && \
            sed -i 's# -f /host/data# -f /host/data/lenet#g' run_spark_on_occlum_glibc.sh && \
            bash run_spark_on_occlum_glibc.sh lenet -b 8 -e 1 | tee bigdl-lenet-mnist.log && \
            cat bigdl-lenet-mnist.log | egrep 'records/second. Loss is' && \
            sed -i 's# -f /host/data/lenet# -f /host/data#g' run_spark_on_occlum_glibc.sh"
          status_2_bigdl_lenet_mnist=$(echo $?)
        fi
        if [ $status_3_bigdl_resnet_cifar10 -ne 0 ]; then
          echo "################## start bigdl resnet cifar10"
          echo "example.3 bigdl resnet cifar10"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            sed -i 's# run_spark_resnet_cifar# run_spark_resnet_cifar --nEpochs 1#g' run_spark_on_occlum_glibc.sh && \
            sed -i 's# -f /host/data# -f /host/data/cifar#g' run_spark_on_occlum_glibc.sh && \
            bash run_spark_on_occlum_glibc.sh resnet | tee bigdl-resnet-cifar10.log && \
            cat bigdl-resnet-cifar10.log | egrep 'Current momentum is ' && \
            sed -i 's# -f /host/data/cifar# -f /host/data#g' run_spark_on_occlum_glibc.sh"
          status_3_bigdl_resnet_cifar10=$(echo $?)
        fi
        if [ $status_4_spark_tpch -ne 0 ]; then
          echo "################## start spark tpch"
          echo "example.4 spark tpch"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            sed -i 's#spark.driver.memory=12g#spark.driver.memory=2g#g' run_spark_on_occlum_glibc.sh && \
            sed -i 's#spark.executor.instances=8#spark.executor.instances=2#g' run_spark_on_occlum_glibc.sh && \
            sed -i 's#executor-memory 8G#executor-memory 2G#g' run_spark_on_occlum_glibc.sh && \
            sed -i 's#-Xmx78g -Xms78g#-Xmx10g -Xms10g#g' run_spark_on_occlum_glibc.sh && \
            sed -i 's#/host/data /host/data/output#/host/data/tpch /host/data/output#g' run_spark_on_occlum_glibc.sh && \
            bash run_spark_on_occlum_glibc.sh tpch | tee spark-tpch.log && \
            cat spark-tpch.log | egrep '22 finished-'"
          status_4_spark_tpch=$(echo $?)
        fi
        if [ $status_5_spark_ut -ne 0 ]; then
          echo "################## start spark unit test"
          echo "example.5 spark unit test"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            sed -i 's#192.168.0.111#$LOCAL_IP#g' run_spark_on_occlum_glibc.sh && \
            bash run_spark_on_occlum_glibc.sh ut | tee spark-unit-test.log && \
            cat spark-unit-test.log | egrep 'FINISHED o.a.s.status.api.v1.sql.SqlResourceSuite:'"
          status_5_spark_ut=$(echo $?)
        fi
        if [ $status_6_spark_xgboost -ne 0 ]; then
          echo "################## start spark xgboost"
          echo "example.6 spark xgboost"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            sed -i 's#-i /host/data -s /host/data/model -t 2 -r 100 -d 2 -w 1#-i /host/data/xgboost -s /host/data/xgboost/model -t 2 -r 10 -d 2 -w 1#g' run_spark_on_occlum_glibc.sh && \
            bash run_spark_on_occlum_glibc.sh xgboost | tee spark-xgboost.log && \
            cat spark-xgboost.log | egrep 'end time is'"
          status_6_spark_xgboost=$(echo $?)
        fi
        if [ $status_7_spark_gbt -ne 0 ]; then
          echo "################## start spark gbt"
          echo "example.7 spark gbt"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            sed -i 's#-i /host/data -s /host/data/model -I 100 -d 5#-i /host/data/gbt -s /host/data/gbt/model -I 10 -d 5#g' run_spark_on_occlum_glibc.sh && \
            bash run_spark_on_occlum_glibc.sh gbt | tee spark-gbt.log && \
            cat spark-gbt.log | egrep 'end time is'"
          status_7_spark_gbt=$(echo $?)
        fi
        if [ $status_8_pyspark_sklearn -ne 0 ]; then
          echo "################## start pyspark sklearn Linear Regression"
          echo "example.8 pyspark sklearn"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            bash run_spark_on_occlum_glibc.sh pysklearn | tee test-pyspark-sklearn-sgx.log && \
            cat test-pyspark-sklearn-sgx.log | egrep 'mean_squared_error'"
          status_8_pyspark_sklearn=$(echo $?)
        fi
        if [ $status_9_pyspark_sql -ne 0 ]; then
          echo "################## start pyspark SQL example"
          echo "example.9 pyspark sql"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            bash run_spark_on_occlum_glibc.sh pysql | tee test-pyspark-sql-sgx.log && \
            cat test-pyspark-sql-sgx.log | egrep 'Example API finished'"
          status_9_pyspark_sql=$(echo $?)
        fi
        if [ $status_10_pyspark_tpch -ne 0 ]; then
          echo "################## start pyspark tpch"
          echo "example.10 pyspark tpch"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            sed -i 's#/host/data/ /host/data/output/ true#/host/data/tpch/ /host/data/output/ false#g' run_spark_on_occlum_glibc.sh && \
            bash run_spark_on_occlum_glibc.sh pytpch | tee pyspark-tpch.log && \
            cat pyspark-tpch.log | egrep 'total time is'"
          status_10_pyspark_tpch=$(echo $?)
        fi
        if [ $status_11_spark_lgbm -ne 0 ]; then
          echo "################## start spark lgbm"
          echo "example.11 spark lgbm"
          docker exec -i $CONTAINER_NAME bash -c "cd /opt && \
            bash run_spark_on_occlum_glibc.sh lgbm | tee spark-lgbm.log && \
            cat spark-lgbm.log | egrep 'acc:'"
          status_11_spark_lgbm=$(echo $?)
        fi
        echo "status_1_spark_pi $status_1_spark_pi"
        echo "status_2_bigdl_lenet_mnist $status_2_bigdl_lenet_mnist"
        echo "status_3_bigdl_resnet_cifar10 $status_3_bigdl_resnet_cifar10"
        echo "status_4_spark_tpch $status_4_spark_tpch"
        #echo "status_5_spark_ut $status_5_spark_ut"
        echo "status_6_spark_xgboost $status_6_spark_xgboost"
        echo "status_7_spark_gbt $status_7_spark_gbt"
        echo "status_8_pyspark_sklearn $status_8_pyspark_sklearn"
        echo "status_9_pyspark_sql $status_9_pyspark_sql"
        echo "status_10_pyspark_tpch $status_10_pyspark_tpch"
        echo "status_11_spark_lgbm $status_11_spark_lgbm"
38 changes: 34 additions & 4 deletions .github/workflows/llm_performance_tests.yml
@@ -36,6 +36,10 @@ jobs:
    env:
      THREAD_NUM: 24
    steps:
      - name: Set environment variables
        shell: bash
        run: |
          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
@@ -55,6 +59,14 @@
        env:
          ANALYTICS_ZOO_ROOT: ${{ github.workspace }}

      - name: Download LLMs
        shell: bash
        run: |
          if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then
            echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR
          fi
      - name: Run LLM Performance test
        env:
          ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
@@ -76,17 +88,14 @@
      THREAD_NUM: 16
      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
    steps:
      - name: Set model directories
        shell: bash
        run: |
          echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV"
      - name: Set environment variables
        shell: bash
        run: |
          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
          echo "LLAMA2_13B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-13b-chat-hf" >> "$GITHUB_ENV"
          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
          echo "WHISPER_MEDIUM_ORIGIN_PATH=${ORIGIN_DIR}/whisper-medium" >> "$GITHUB_ENV"
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
@@ -112,6 +121,27 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          bash python/llm/test/run-llm-install-tests.sh
      - name: Download LLMs
        shell: bash
        run: |
          if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then
            echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR
          fi
          if [ ! -d $LLAMA2_13B_ORIGIN_PATH ]; then
            echo "Directory $LLAMA2_13B_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-13b-chat-hf -P $ORIGIN_DIR
          fi
          if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
            echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
          fi
          if [ ! -d $WHISPER_MEDIUM_ORIGIN_PATH ]; then
            echo "Directory $WHISPER_MEDIUM_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/whisper-medium -P $ORIGIN_DIR
          fi
      - name: Test on xpu
        shell: bash
        run: |
37 changes: 36 additions & 1 deletion .github/workflows/nightly_test.yml
@@ -3,7 +3,7 @@ name: Nightly Test
on:

  #pull_request:
    #branches: [ main ]
    # branches: [ main ]

  schedule:
    - cron: '30 15 * * *' # GMT time, 15:30 GMT == 23:30 China
@@ -68,6 +68,7 @@ on:
        - PPML-Scala-UT
        - PPML-Python-UT-Spark3
        - PPML-Occlum-ExampleTests
        - PPML-Occlum-EDMM-ExampleTests
        - PPML-spark-Local-SimpleQuery-Tests-on-Gramine
        - PPML-RealTime-ML-Occlum
        - PPML-RealTime-ML-Occlum-K8s
@@ -1402,6 +1403,40 @@ jobs:
          job-name: PPML-Occlum-ExampleTests
          runner-hosted-on: 'Shanghai'

  PPML-Occlum-EDMM-ExampleTests:
    if: ${{ github.event.inputs.artifact == 'PPML-Occlum-EDMM-ExampleTests' }}
    runs-on: [self-hosted, EDMM]

    steps:
      - uses: actions/checkout@v3
      - name: Set up JDK8
        uses: ./.github/actions/jdk-setup-action
      - name: Set up maven
        uses: ./.github/actions/maven-setup-action
      - name: set env
        env:
          DEFAULT_IMAGE: '10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum'
          DEFAULT_TAG: '2.4.0-SNAPSHOT-EDMM'
        run: |
          echo "TAG=${{ github.event.inputs.tag || env.DEFAULT_TAG }}" >> $GITHUB_ENV
          echo "IMAGE=${{ github.event.inputs.image || env.DEFAULT_IMAGE }}" >> $GITHUB_ENV
      - name: Run Test
        uses: ./.github/actions/ppml/ppml-occlum-EDMM-exampletests-action
        with:
          image: ${{env.IMAGE}}
          image-tag: ${{env.TAG}}
      - name: Create Job Badge
        uses: ./.github/actions/create-job-status-badge
        if: ${{ always() }}
        with:
          secret: ${{ secrets.GIST_SECRET}}
          gist-id: ${{env.GIST_ID}}
          is-self-hosted-runner: true
          file-name: PPML-Occlum-EDMM-ExampleTests.json
          type: job
          job-name: PPML-Occlum-EDMM-ExampleTests
          runner-hosted-on: 'Shanghai'

  PPML-RealTime-ML-Occlum:
    if: ${{ github.event.schedule || github.event.inputs.artifact == 'PPML-RealTime-ML-Occlum' || github.event.inputs.artifact == 'all' }}
    runs-on: [self-hosted, Vilvarin]
12 changes: 6 additions & 6 deletions README.md
@@ -12,8 +12,8 @@
> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*
### Latest update
- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/gpu/qlora_finetuning).
- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples [here](python/llm/example/gpu).
- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/GPU/QLoRA-FineTuning).
- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples [here](python/llm/example/GPU).
- `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial).
- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).

@@ -76,7 +76,7 @@ input_ids = tokenizer.encode(input_str, ...)
output_ids = model.generate(input_ids, ...)
output = tokenizer.batch_decode(output_ids)
```
*See the complete examples [here](python/llm/example/transformers/transformers_int4/).*
*See the complete examples [here](python/llm/example/CPU/HF-Transformers-AutoModels/Model).*
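For readers skimming the diff, the hunk above cuts off the model-loading half of the CPU INT4 example. A minimal self-contained sketch of the flow, assuming the bigdl-llm transformers-style `AutoModelForCausalLM` API with `load_in_4bit=True`, and using a placeholder model path and prompt:
```python
# Illustrative sketch only; the model path and prompt are placeholders, not part of this commit.
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = '/path/to/model/'  # hypothetical local checkpoint directory
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True)  # INT4 optimization on CPU
tokenizer = AutoTokenizer.from_pretrained(model_path)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=32)
output = tokenizer.batch_decode(output_ids)
```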

#### GPU INT4
##### Install
@@ -105,7 +105,7 @@ input_ids = tokenizer.encode(input_str, ...).to('xpu')
output_ids = model.generate(input_ids, ...)
output = tokenizer.batch_decode(output_ids.cpu())
```
*See the complete examples [here](python/llm/example/gpu/).*
*See the complete examples [here](python/llm/example/GPU).*
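Likewise, a hedged end-to-end sketch of the GPU INT4 path shown above; it assumes an Intel XPU device and that `intel_extension_for_pytorch` is installed (neither appears in this hunk), and again uses placeholder paths and prompts:
```python
# Illustrative sketch only; paths and prompt are placeholders.
import intel_extension_for_pytorch as ipex  # noqa: F401 -- importing registers the 'xpu' device with PyTorch
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = '/path/to/model/'  # hypothetical local checkpoint directory
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True).to('xpu')
tokenizer = AutoTokenizer.from_pretrained(model_path)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')
output_ids = model.generate(input_ids, max_new_tokens=32)
output = tokenizer.batch_decode(output_ids.cpu())  # move results back to the CPU before decoding
```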

#### More Low-Bit Support
##### Save and load
@@ -115,15 +115,15 @@ After the model is optimized using `bigdl-llm`, you may save and load the model
model.save_low_bit(model_path)
new_model = AutoModelForCausalLM.load_low_bit(model_path)
```
*See the complete example [here](python/llm/example/transformers/transformers_low_bit/).*
*See the complete example [here](python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load).*
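The two lines above read more clearly as a full round trip; a minimal sketch, assuming the same bigdl-llm `AutoModelForCausalLM` class and placeholder paths:
```python
# Illustrative sketch only; paths are placeholders.
from bigdl.llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
model.save_low_bit('/path/to/low-bit-model/')  # persist the already-quantized weights

# Later (or in another process): reload without touching the original checkpoint.
new_model = AutoModelForCausalLM.load_low_bit('/path/to/low-bit-model/')
```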

##### Additional data types

In addition to INT4, you may apply other low-bit optimizations (such as *INT8*, *INT5*, *NF4*, etc.) as follows:
```python
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int8")
```
*See the complete example [here](python/llm/example/transformers/transformers_low_bit/).*
*See the complete example [here](python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types).*


***For more details, please refer to the `bigdl-llm` [Document](https://test-bigdl-llm.readthedocs.io/en/main/doc/LLM/index.html), [Readme](python/llm), [Tutorial](https://github.com/intel-analytics/bigdl-llm-tutorial) and [API Doc](https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/LLM/index.html).***