Commit

Merge branch 'main' into cicd_release_zip
JinBridger authored Nov 1, 2023
2 parents f1b25ff + d18e153 commit 16f531a
Showing 270 changed files with 15,123 additions and 6,140 deletions.
3 changes: 3 additions & 0 deletions .github/actions/llm/setup-llm-env/action.yml
@@ -14,6 +14,9 @@ runs:
# make sure we install the latest version for bigdl-core-xe
pip uninstall bigdl-core-xe || true
sed -i 's/"bigdl-core-xe==" + VERSION + "/"bigdl-core-xe/g' python/llm/setup.py
# make sure we install the latest version for bigdl-core-xe-esimd
pip uninstall bigdl-core-xe-esimd || true
sed -i 's/"bigdl-core-xe-esimd==" + VERSION + "/"bigdl-core-xe-esimd/g' python/llm/setup.py
pip install requests
if [[ ${{ runner.os }} == 'Linux' ]]; then
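The added lines uninstall any previously installed bigdl-core-xe-esimd and strip its version pin from python/llm/setup.py, so the subsequent install resolves the newest published build. A minimal sketch of what the sed substitution does to a stand-in requirement string (the platform-marker text below is hypothetical, not copied from the repo):

    # demonstrate the substitution on a stand-in requirement string (hypothetical marker text)
    echo '"bigdl-core-xe-esimd==" + VERSION + ";platform_system != Windows"' \
      | sed 's/"bigdl-core-xe-esimd==" + VERSION + "/"bigdl-core-xe-esimd/g'
    # prints: "bigdl-core-xe-esimd;platform_system != Windows"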
@@ -21,7 +21,7 @@ runs:
pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} tensorboardX==2.1
pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} tensorflow==2.3.0
pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} tensorflow-estimator==2.3.0
-pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} xgboost_ray
+pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} xgboost_ray==0.1.8
pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} Pillow
pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} pyarrow==4.0.1
pip install -i ${GONDOLIN_PIP_MIRROR} --trusted-host ${GONDOLIN_TRUSTED_HOST} h5py==2.10.0
@@ -0,0 +1,191 @@
name: 'Run PPML Occlum k8s ExampleTests'
description: 'Run PPML Occlum k8s ExampleTests'
inputs:
image:
description: 'image'
required: true
default: '10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum'
image-tag:
description: 'image tag'
required: true
default: '2.4.0-SNAPSHOT-EDMM'
runs:
using: "composite"
steps:
- name: Run tests
shell: bash
env:
DEFAULT_IMAGE: ${{ inputs.image }}:${{ inputs.image-tag }}
run: |
whoami
export IMAGE=${{ env.DEFAULT_IMAGE }}
docker pull $IMAGE
docker pull intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum:2.4.0-SNAPSHOT-EDMM
export NO_PROXY=10.239.45.10:8081,10.112.231.51,10.239.45.10,172.168.0.205,172.168.0.210
export kubernetes_master_url=172.168.0.210
export SPARK_HOME=/opt/spark-3.1.2-bin-hadoop3.2
cd /home/icx/BigDL/ppml/trusted-big-data-ml/scala/docker-occlum/kubernetes
status_1_spark_pi=1
status_2_spark_lr=1
status_3_spark_sql=1
status_4_spark_gbt=1
status_5_spark_gbt_criteo=1
status_6_spark_tpch=1
status_7_pyspark_sklearn=1
status_8_pyspark_sql=1
status_9_pyspark_tpch=1
status_10_spark_lgbm=1
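# Each status flag starts at 1 ("not yet passed"). After an example runs, the flag is reset to the
# exit code of the egrep over its driver log, so 0 means the expected output marker was found.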
if [ $status_1_spark_pi -ne 0 ]; then
echo "################## start spark pi"
echo "example.1 spark pi"
bash run_spark_pi.sh $IMAGE
if [ "`kubectl get pods | grep spark-pi-test | awk '{print $1}'`" != "" ]; then
pi_driver=`kubectl get pods | grep spark-pi-test | awk '{print $1}' | grep driver`
fi
kubectl logs $pi_driver | egrep 'Pi is roughly 3'
status_1_spark_pi=$(echo $?)
if [ "`kubectl get pods | grep spark-pi-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $pi_driver
fi
fi
if [ $status_2_spark_lr -ne 0 ]; then
echo "################## start spark lr"
echo "example.2 spark lr"
bash run_spark_lr.sh $IMAGE
if [ "`kubectl get pods | grep spark-lr-test | awk '{print $1}'`" != "" ]; then
lr_driver=`kubectl get pods | grep spark-lr-test | awk '{print $1}' | grep driver`
fi
kubectl logs $lr_driver | egrep 'Training data results'
status_2_spark_lr=$(echo $?)
if [ "`kubectl get pods | grep spark-lr-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $lr_driver
fi
fi
if [ $status_3_spark_sql -ne 0 ]; then
echo "################## start spark sql"
echo "example.3 spark sql"
bash run_spark_sql.sh $IMAGE
if [ "`kubectl get pods | grep spark-sql-test | awk '{print $1}'`" != "" ]; then
sql_driver=`kubectl get pods | grep spark-sql-test | awk '{print $1}' | grep driver`
fi
kubectl logs $sql_driver | egrep 'Name: Michael'
status_3_spark_sql=$(echo $?)
if [ "`kubectl get pods | grep spark-sql-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $sql_driver
fi
fi
if [ $status_4_spark_gbt -ne 0 ]; then
echo "################## start spark gbt"
echo "example.4 spark gbt"
bash run_spark_gbt.sh $IMAGE
if [ "`kubectl get pods | grep spark-gbt-example-test | awk '{print $1}'`" != "" ]; then
gbt_example_driver=`kubectl get pods | grep spark-gbt-example-test | awk '{print $1}' | grep driver`
fi
kubectl logs $gbt_example_driver | egrep 'Training data results'
status_4_spark_gbt=$(echo $?)
if [ "`kubectl get pods | grep spark-gbt-example-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $gbt_example_driver
fi
fi
if [ $status_5_spark_gbt_criteo -ne 0 ]; then
echo "################## start spark gbt criteo"
echo "example.5 spark gbt criteo"
bash run_spark_gbt_criteo.sh $IMAGE
if [ "`kubectl get pods | grep spark-gbt-criteo-test | awk '{print $1}'`" != "" ]; then
gbt_criteo_driver=`kubectl get pods | grep spark-gbt-criteo-test | grep driver | awk '{print $1}'`
fi
kubectl logs $gbt_criteo_driver | egrep 'end time is'
status_5_spark_gbt_criteo=$(echo $?)
if [ "`kubectl get pods | grep spark-gbt-criteo-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $gbt_criteo_driver
fi
fi
if [ $status_6_spark_tpch -ne 0 ]; then
echo "################## start spark tpch"
echo "example.6 spark tpch"
bash run_spark_tpch.sh $IMAGE
if [ "`kubectl get pods | grep spark-tpch-test | awk '{print $1}'`" != "" ]; then
tpch_driver=`kubectl get pods | grep spark-tpch-test | awk '{print $1}' | grep driver`
fi
kubectl logs $tpch_driver | egrep '22 finished-'
status_6_spark_tpch=$(echo $?)
if [ "`kubectl get pods | grep spark-tpch-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $tpch_driver
fi
fi
if [ $status_7_pyspark_sklearn -ne 0 ]; then
echo "################## start pyspark sklearn"
echo "example.7 pyspark sklearn"
bash run_pyspark_sklearn_example.sh $IMAGE
if [ "`kubectl get pods | grep pyspark-sklearn-test | awk '{print $1}'`" != "" ]; then
sklearn_driver=`kubectl get pods | grep pyspark-sklearn-test | awk '{print $1}' | grep driver`
fi
kubectl logs $sklearn_driver | egrep 'mean_squared_error'
status_7_pyspark_sklearn=$(echo $?)
if [ "`kubectl get pods | grep pyspark-sklearn-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $sklearn_driver
fi
fi
if [ $status_8_pyspark_sql -ne 0 ]; then
echo "################## start pyspark SQL example"
echo "example.8 pyspark sql"
bash run_pyspark_sql_example.sh $IMAGE
if [ "`kubectl get pods | grep pyspark-pysql-test | awk '{print $1}'`" != "" ]; then
sql_driver=`kubectl get pods | grep pyspark-pysql-test | awk '{print $1}' | grep driver`
fi
kubectl logs $sql_driver | egrep 'Example API finished'
status_8_pyspark_sql=$(echo $?)
if [ "`kubectl get pods | grep pyspark-pysql-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $sql_driver
fi
fi
if [ $status_9_pyspark_tpch -ne 0 ]; then
echo "################## start pyspark tpch"
echo "example.9 pyspark tpch"
bash run_pyspark_tpch.sh $IMAGE
if [ "`kubectl get pods | grep pyspark-tpch-test | awk '{print $1}'`" != "" ]; then
pytpch_driver=`kubectl get pods | grep pyspark-tpch-test | awk '{print $1}' | grep driver`
fi
kubectl logs $pytpch_driver | egrep 'total time is'
status_9_pyspark_tpch=$(echo $?)
if [ "`kubectl get pods | grep pyspark-tpch-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $pytpch_driver
fi
fi
if [ $status_10_spark_lgbm -ne 0 ]; then
echo "################## start spark lgbm"
echo "example.10 spark lgbm"
bash run_spark_lgbm.sh $IMAGE
if [ "`kubectl get pods | grep spark-lgbm-test | awk '{print $1}'`" != "" ]; then
lgbm_driver=`kubectl get pods | grep spark-lgbm-test | awk '{print $1}' | grep driver`
fi
kubectl logs $lgbm_driver | egrep 'acc:'
status_10_spark_lgbm=$(echo $?)
if [ "`kubectl get pods | grep spark-lgbm-test | awk '{print $1}'`" != "" ]; then
kubectl delete pods $lgbm_driver
fi
fi
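# Summary of all example results; any non-zero value below marks a failed example.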
echo "status_1_spark_pi $status_1_spark_pi"
echo "status_2_spark_lr $status_2_spark_lr"
echo "status_3_spark_sql $status_3_spark_sql"
echo "status_4_spark_gbt $status_4_spark_gbt"
echo "status_5_spark_gbt_criteo $status_5_spark_gbt_criteo"
echo "status_6_spark_tpch $status_6_spark_tpch"
echo "status_7_pyspark_sklearn $status_7_pyspark_sklearn"
echo "status_8_pyspark_sql $status_8_pyspark_sql"
echo "status_9_pyspark_tpch $status_9_pyspark_tpch"
echo "status_10_spark_lgbm $status_10_spark_lgbm"
30 changes: 30 additions & 0 deletions .github/workflows/llm-binary-build.yml
@@ -60,6 +60,8 @@ jobs:
yum install -y gcc-toolset-11 cmake git
conda remove -n python39 --all -y
conda create -n python39 python=3.9 -y
conda remove -n python310 --all -y
conda create -n python310 python=3.10 -y
- uses: actions/checkout@v3
with:
repository: "intel-analytics/llm.cpp"
@@ -103,6 +105,18 @@
run: |
mv src/chatglm/build/main release/main-chatglm_vnni
mv src/chatglm/build/_C.cpython-39-x86_64-linux-gnu.so release/chatglm_C.cpython-39-x86_64-linux-gnu.so
- name: Build Chatglm Py310
shell: bash
run: |
source activate python310 || conda activate python310
cd src/chatglm
rm -r build
scl enable gcc-toolset-11 "cmake -B build"
scl enable gcc-toolset-11 "cmake --build build --config Release -j"
- name: Move Chatglm binaries Py310
shell: bash
run: |
mv src/chatglm/build/_C.cpython-310-x86_64-linux-gnu.so release/chatglm_C.cpython-310-x86_64-linux-gnu.so
- name: Archive build files
uses: actions/upload-artifact@v3
with:
@@ -114,6 +128,7 @@
run: |
make clean
conda remove -n python39 --all -y
conda remove -n python310 --all -y
check-linux-avx512-artifact:
runs-on: ubuntu-latest
@@ -401,6 +416,21 @@ jobs:
run: |
mv src/chatglm/build/Release/main.exe release/main-chatglm_vnni.exe
mv src/chatglm/build/Release/_C.cp39-win_amd64.pyd release/chatglm_C.cp39-win_amd64.pyd
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Build Chatglm Py310
shell: powershell
run: |
cd src/chatglm
rm -r build
cmake -DAVXVNNI=ON -B build
cmake --build build --config Release -j
- name: Move Chatglm binaries Py310
shell: powershell
run: |
mv src/chatglm/build/Release/_C.cp310-win_amd64.pyd release/chatglm_C.cp310-win_amd64.pyd
- name: Archive build files
uses: actions/upload-artifact@v3
with:
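The new steps mirror the existing Python 3.9 flow: each interpreter needs its own build because CPython extension modules carry per-version ABI tags (cpython-39 vs cpython-310 on Linux, cp39 vs cp310 on Windows). Condensed into a loop, the Linux per-version build the workflow performs looks roughly like this (an illustrative sketch, not how the workflow is actually written; the workflow runs separate named steps and also moves the py39 main binary):

    for py in python39 python310; do
      source activate "$py" || conda activate "$py"
      rm -rf src/chatglm/build
      scl enable gcc-toolset-11 "cmake -S src/chatglm -B src/chatglm/build"
      scl enable gcc-toolset-11 "cmake --build src/chatglm/build --config Release -j"
      so=$(ls src/chatglm/build/_C.cpython-3*-x86_64-linux-gnu.so)
      mv "$so" "release/chatglm$(basename "$so")"   # e.g. _C.cpython-310-... -> chatglm_C.cpython-310-...
    done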
2 changes: 2 additions & 0 deletions .github/workflows/llm_example_tests.yml
@@ -18,6 +18,8 @@ on:
- '.github/actions/llm/setup-llm-env/action.yml'
- '.github/actions/llm/remove-llm-env/action.yml'
- '.github/actions/llm/download-llm-binary/action.yml'
- 'python/llm/dev/test/run-example-tests.sh'
- 'python/llm/example/**'
workflow_dispatch:
workflow_call:

59 changes: 17 additions & 42 deletions .github/workflows/llm_performance_tests.yml
@@ -26,6 +26,7 @@ jobs:
llm-cpp-build:
uses: ./.github/workflows/llm-binary-build.yml
llm-performance-test:
if: false # skip cpu performance test for now; may add it back with separated runner
needs: llm-cpp-build
strategy:
fail-fast: false
@@ -88,25 +89,25 @@
THREAD_NUM: 16
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
steps:
- name: Set environment variables
shell: bash
run: |
echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
echo "LLAMA2_13B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-13b-chat-hf" >> "$GITHUB_ENV"
echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
echo "WHISPER_MEDIUM_ORIGIN_PATH=${ORIGIN_DIR}/whisper-medium" >> "$GITHUB_ENV"
- uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
shell: bash
# pip install transformers_stream_generator for model internlm-chat-7b-8k
# pip install tiktoken for model Qwen-7B-Chat-10-12
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade setuptools
python -m pip install --upgrade wheel
python -m pip install --upgrade omegaconf
python -m pip install --upgrade pandas
python -m pip install --upgrade einops
python -m pip install --upgrade transformers_stream_generator
python -m pip install --upgrade tiktoken
- name: Download llm binary
uses: ./.github/actions/llm/download-llm-binary
@@ -122,44 +123,18 @@ jobs:
source /opt/intel/oneapi/setvars.sh
bash python/llm/test/run-llm-install-tests.sh
- name: Download LLMs
shell: bash
run: |
if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then
echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR
fi
if [ ! -d $LLAMA2_13B_ORIGIN_PATH ]; then
echo "Directory $LLAMA2_13B_ORIGIN_PATH not found. Downloading from FTP server..."
wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-13b-chat-hf -P $ORIGIN_DIR
fi
if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
fi
if [ ! -d $WHISPER_MEDIUM_ORIGIN_PATH ]; then
echo "Directory $WHISPER_MEDIUM_ORIGIN_PATH not found. Downloading from FTP server..."
wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/whisper-medium -P $ORIGIN_DIR
fi
- name: Test on xpu
shell: bash
run: |
source /opt/intel/oneapi/setvars.sh
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
cd python/llm/test/benchmark/gpu
mv python/llm/test/benchmark/arc-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
export http_proxy=${HTTP_PROXY}
export https_proxy=${HTTPS_PROXY}
rm -rf test-result || true
mkdir test-result
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_7B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/llama2_7b-32-32.log
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_7B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/llama2_7b-1024-1024.log
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_13B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/llama2_13b-32-32.log
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_13B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/llama2_13b-1024-1024.log
taskset -c 0-$((THREAD_NUM - 1)) python chatglm2.py --model-dir="${CHATGLM2_6B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/chatglm2_6b-32-32.log
taskset -c 0-$((THREAD_NUM - 1)) python chatglm2.py --model-dir="${CHATGLM2_6B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/chatglm2_6b-1024-1024.log
taskset -c 0-$((THREAD_NUM - 1)) python whisper.py --model-dir="${WHISPER_MEDIUM_ORIGIN_PATH}" > test-result/whisper_medium-default-default.log
python ../analyze_log_dir.py --log-dir=./test-result --output-path=./xpu_latency.csv
timestamp=`date '+%Y%m%d'`
curl -T ./xpu_latency.csv ${LLM_FTP_URL}/llm/ggml-actions/perf/xpu_lantency_$timestamp.csv
python run.py
curl -T ./*.csv ${LLM_FTP_URL}/llm/ggml-actions/perf/
cp ./*.csv /mnt/disk1/nightly_perf/
cd ../../../test/benchmark
python csv_to_html.py -f /mnt/disk1/nightly_perf/
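Taken together, the reworked xpu step drives the all-in-one benchmark from a YAML config (arc-perf-test.yaml dropped in as config.yaml) instead of invoking per-model scripts, then uploads and publishes the resulting csv files. Roughly the same flow could be reproduced by hand as follows (a sketch assuming a local oneAPI install and the repository layout above; the results directory is a placeholder, and the workflow uses mv rather than cp):

    source /opt/intel/oneapi/setvars.sh
    export USE_XETLA=OFF
    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
    cp python/llm/test/benchmark/arc-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml
    cd python/llm/dev/benchmark/all-in-one
    python run.py                                 # writes csv result files into the current directory
    cd ../../../test/benchmark
    python csv_to_html.py -f /path/to/results     # placeholder directory holding the csv files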