Harness/All other tasks left #9449

Closed
wants to merge 24 commits into from
72 changes: 50 additions & 22 deletions .github/workflows/llm-harness-evaluation.yml
@@ -20,25 +20,52 @@ on:
jobs:
llm-cpp-build:
uses: ./.github/workflows/llm-binary-build.yml
llm-nightly-harness-test:
llm-harness-evaluation:
timeout-minutes: 1000
needs: llm-cpp-build
strategy:
fail-fast: false
matrix:
# include:
# python-version: "3.9"
# model_name: "stablelm-3b-4e1t"
# task: "arc"
# precision: "sym_int4" #options: sym_int4, fp4, nf4, mixed_4bit, fp8
python-version: ["3.9"]
model_name: [stablelm-3b-4e1t]
task: ["truthfulqa"]
precision: ["int4"]
runs-on: [self-hosted, llm, accuracy, temp-arc01]
model_name: [falcon-7b-instruct-with-patch, Mistral-7B-v0.1]
task: [truthfulqa, arc, hellaswag, mmlu]
precision: [mixed_fp8]
include:
- python-version: "3.9"
model_name: falcon-7b-instruct-with-patch
task: arc
precision: sym_int4 #options: sym_int4, fp4, nf4, mixed_4bit, fp8
- python-version: "3.9"
model_name: falcon-7b-instruct-with-patch
task: arc
precision: fp4 #options: sym_int4, fp4, nf4, mixed_4bit, fp8
- python-version: "3.9"
model_name: falcon-7b-instruct-with-patch
task: arc
precision: fp8 #options: sym_int4, fp4, nf4, mixed_4bit, fp8
- python-version: "3.9"
model_name: falcon-7b-instruct-with-patch
task: arc
precision: mixed_fp4 #options: sym_int4, fp4, nf4, mixed_4bit, fp8
- python-version: "3.9"
model_name: Llama-2-7b-chat-hf
task: mmlu
precision: fp8 #options: sym_int4, fp4, nf4, mixed_4bit, fp8
- python-version: "3.9"
model_name: Llama-2-7b-chat-hf
task: hellaswag
precision: fp8 #options: sym_int4, fp4, nf4, mixed_4bit, fp8
runs-on: [self-hosted, llm, accuracy]
env:
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
ORIGIN_DIR: /mnt/disk1/models
HARNESS_HF_HOME: /mnt/disk1/harness_home
steps:
- name: Set model and dataset directories
shell: bash
run: |
echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV"
echo "HARNESS_HF_HOME=/mnt/disk1/harness_home" >> "$GITHUB_ENV"

- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
@@ -60,16 +87,13 @@ jobs:
extra-dependency: "xpu"

- name: Install harness
working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/
shell: bash
run: |
cd python/llm/dev/benchmark/harness/
git clone https://github.com/EleutherAI/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout e81d3cc
pip install -e .
git apply ../bigdl-llm.patch
cd ..


- name: Download models and datasets
shell: bash
@@ -84,17 +108,21 @@
wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
fi

- name: Set datasets env
- name: Upgrade packages
shell: bash
run: |
echo "HF_HOME=$HARNESS_HF_HOME" >> "$GITHUB_ENV"
echo "HF_DATASETS=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
echo "HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
pip install --upgrade transformers

- name: Run harness
shell: bash
working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
env:
USE_XETLA: OFF
# SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
run: |
export USE_XETLA=OFF
export HF_HOME=${HARNESS_HF_HOME}
export HF_DATASETS=$HARNESS_HF_HOME/datasets
export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
source /opt/intel/oneapi/setvars.sh
cd python/llm/dev/benchmark/harness
python llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --output_dir results/${{ matrix.model_name }} --batch 1
python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache

8 changes: 3 additions & 5 deletions python/llm/dev/benchmark/harness/README.md
@@ -9,20 +9,18 @@
git clone https://github.com/EleutherAI/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout e81d3cc
pip install -e .
git apply ../bigdl-llm.patch
cd ..
```

## Run
run `python llb.py`. `llb.py` combines some arguments in `main.py` to make evalutions easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).
run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`run_llb.py`](run_llb.py).

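To illustrate the argument-mapping idea described above, here is a minimal hypothetical sketch of how a shorthand-to-harness task mapping might look; the dict contents and function name below are assumptions for illustration, not the actual contents of `run_llb.py`:

```python
# Hypothetical sketch: a short task alias on the command line expands to the
# full lm-evaluation-harness task name (mappings here are assumptions).
TASK_MAP = {
    "arc": "arc_challenge",
    "hellaswag": "hellaswag",
    "truthfulqa": "truthfulqa_mc",
    "mmlu": "hendrycksTest-*",
}

def expand_tasks(tasks):
    """Translate shorthand task names into harness task names,
    passing through any name that has no mapping."""
    return [TASK_MAP.get(t, t) for t in tasks]

print(expand_tasks(["arc", "mmlu"]))  # → ['arc_challenge', 'hendrycksTest-*']
```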
### Evaluation on CPU
```bash
python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
```
### Evaluation on Intel GPU
```bash
python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
```
## Results
We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics, `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa` and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks which means users may need to average them manually to get final result.
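Since the 57 `mmlu` subtask scores must be averaged manually, a small helper can do it; the result-dict layout below (subtask name mapping to a dict with an `acc` key) is an assumption for illustration, not the harness's exact output schema:

```python
# Sketch: averaging MMLU subtask accuracies into a single score.
# Assumes results are keyed by subtask name with an "acc" field.
def average_mmlu(results):
    """Return the unweighted mean accuracy over all MMLU subtasks."""
    accs = [v["acc"] for k, v in results.items() if k.startswith("hendrycksTest")]
    return sum(accs) / len(accs)

# Toy example with 3 of the 57 subtasks (values are made up):
subtasks = {
    "hendrycksTest-abstract_algebra": {"acc": 0.30},
    "hendrycksTest-anatomy": {"acc": 0.45},
    "hendrycksTest-astronomy": {"acc": 0.60},
}
print(average_mmlu(subtasks))  # → 0.45
```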
151 changes: 0 additions & 151 deletions python/llm/dev/benchmark/harness/bigdl-llm.patch

This file was deleted.
