Harness/run mistral #9426

Closed
wants to merge 17 commits
49 changes: 28 additions & 21 deletions .github/workflows/llm-harness-evaluation.yml
@@ -20,25 +20,32 @@ on:
jobs:
llm-cpp-build:
uses: ./.github/workflows/llm-binary-build.yml
llm-nightly-harness-test:
llm-harness-evaluation:
timeout-minutes: 1000
needs: llm-cpp-build
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
model_name: ["Llama-2-7b-chat-hf"]
model_name: ["Mistral-7B-v0.1"]
task: ["truthfulqa"]
precision: ["int4"]
precision: ["fp4", "sym_int4", "fp8", "mixed_4bit"]
include:
- python-version: "3.9"
model_name: "Mistral-7B-v0.1"
task: "arc"
precision: "mixed_4bit" #options: sym_int4, fp4, nf4, mixed_4bit, fp8
- python-version: "3.9"
model_name: "Llama-2-7b-chat-hf"
task: "mmlu"
precision: "fp8" #options: sym_int4, fp4, nf4, mixed_4bit, fp8

runs-on: [self-hosted, llm, accuracy]
env:
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
ORIGIN_DIR: /mnt/disk1/models
HARNESS_HF_HOME: /mnt/disk1/harness_home
steps:
- name: Set model and dataset directories
shell: bash
run: |
echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV"
echo "HARNESS_HF_HOME=/mnt/disk1/harness_home" >> "$GITHUB_ENV"

- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
@@ -60,16 +67,13 @@ jobs:
extra-dependency: "xpu"

- name: Install harness
working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/
shell: bash
run: |
cd python/llm/dev/benchmark/harness/
git clone https://github.com/EleutherAI/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout e81d3cc
pip install -e .
git apply ../bigdl-llm.patch
cd ..


- name: Download models and datasets
shell: bash
@@ -84,18 +88,21 @@ jobs:
wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
fi

- name: Set datasets env
- name: Upgrade packages
shell: bash
run: |
echo "HF_HOME=$HARNESS_HF_HOME" >> "$GITHUB_ENV"
echo "HF_DATASETS=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
echo "HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
pip install --upgrade transformers

- name: Run harness
shell: bash
working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
env:
USE_XETLA: OFF
SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
run: |
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export HF_HOME=${HARNESS_HF_HOME}
export HF_DATASETS=$HARNESS_HF_HOME/datasets
export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
source /opt/intel/oneapi/setvars.sh
cd python/llm/dev/benchmark/harness
python llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --output_dir results/${{ matrix.model_name }} --batch 1
python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache

8 changes: 3 additions & 5 deletions python/llm/dev/benchmark/harness/README.md
@@ -9,20 +9,18 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout e81d3cc
pip install -e .
git apply ../bigdl-llm.patch
cd ..
```

## Run
run `python llb.py`. `llb.py` combines some arguments in `main.py` to make evalutions easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).
Run `python run_llb.py`. `run_llb.py` combines some of the arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`run_llb.py`](run_llb.py).
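The exact mapping lives in the script itself; as a rough illustration (the names below are hypothetical, not copied from this PR), such a dict simply translates the short task names accepted on the command line into the task names the harness expects:

```python
# Hypothetical sketch of the task-name mapping described above.
task_map = {
    "hellaswag": "hellaswag",
    "arc": "arc_challenge",
    "truthfulqa": "truthfulqa_mc",
    "mmlu": "hendrycksTest-*",  # expanded to the 57 hendrycksTest subtasks
}
```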

### Evaluation on CPU
```bash
python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
```
### Evaluation on Intel GPU
```bash
python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
```
## Results
We follow the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) in the metrics we record: `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa`, and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks, so users may need to average them manually to get the final result.
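For reference, a minimal sketch of that manual averaging (assuming the harness's v0.3.0-style results JSON, where each `mmlu` subtask appears as a `hendrycksTest-*` entry under `results`; the output path is hypothetical):

```python
import json
from statistics import mean

# Average `acc` across the 57 hendrycksTest-* (mmlu) subtasks in a results file.
with open("results/Mistral-7B-v0.1/results.json") as f:  # hypothetical output path
    results = json.load(f)["results"]

mmlu_acc = [v["acc"] for k, v in results.items() if k.startswith("hendrycksTest-")]
print(f"mmlu: averaged acc over {len(mmlu_acc)} subtasks = {mean(mmlu_acc):.4f}")
```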
151 changes: 0 additions & 151 deletions python/llm/dev/benchmark/harness/bigdl-llm.patch

This file was deleted.

121 changes: 121 additions & 0 deletions python/llm/dev/benchmark/harness/bigdl_llm.py
@@ -0,0 +1,121 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import multiprocessing

from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM

import torch
from typing import Optional, Union
from lm_eval.base import BaseLM

from transformers import AutoTokenizer, LlamaTokenizer

def _get_dtype(
dtype: Union[str, torch.dtype]
) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
_torch_dtype = getattr(torch, dtype)
else:
_torch_dtype = dtype
return _torch_dtype
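# Example (illustrative): _get_dtype("float16") returns torch.float16;
# "auto" or an existing torch.dtype value is passed through unchanged.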

class BigDLLM(BaseLM):
def __init__(
self,
device="xpu",
pretrained="gpt2",
revision="main",
low_cpu_mem_usage=None,
subfolder=None,
tokenizer=None,
batch_size=1,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
load_in_low_bit=None,
dtype: Optional[Union[str, torch.dtype]] = "auto",
):
super().__init__()

assert isinstance(pretrained, str)
assert isinstance(batch_size, (int,str))
if device == 'xpu':
import intel_extension_for_pytorch as ipex
model = AutoModelForCausalLM.from_pretrained(pretrained,
load_in_low_bit=load_in_low_bit,
optimize_model=True,
trust_remote_code=True,
use_cache=True,
torch_dtype=_get_dtype(dtype))
print(model) # print model to check precision
self._device = device
self.model = model.to(device)

self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)

# setup for automatic batch size detection
if batch_size == 'auto':
self.batch_size_per_gpu = batch_size
else:
self.batch_size_per_gpu = int(batch_size)

@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id

@property
def max_length(self):
return 2048 # TODO: how to get this from config

@property
def max_gen_toks(self):
return 256

@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus

@property
def device(self):
# TODO: fix multi-gpu
return torch.device(self._device)

def tok_encode(self, string: str):
input_ids = self.tokenizer.encode(string)
return input_ids

def tok_decode(self, tokens):
return self.tokenizer.decode(tokens, skip_special_tokens=True)

def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call

returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.inference_mode():
inps = inps.to(self.device)
res = self.model(inps)[0]
return res

def _model_generate(self, context, max_length, eos_token_id):
# greedy generation with the HF `generate` API, stopping at eos_token_id or max_length
return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False)
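For context, a minimal sketch of how this wrapper could be driven through the harness's evaluator directly (assuming `simple_evaluate` at the pinned commit accepts an LM instance; in the workflow above the same flow goes through `run_llb.py`, and the paths and precision here are examples only):

```python
# Illustrative usage of the BigDLLM wrapper; not the PR's CLI entry point.
from lm_eval import evaluator
from bigdl_llm import BigDLLM

lm = BigDLLM(
    device="xpu",
    pretrained="/path/to/Mistral-7B-v0.1",
    load_in_low_bit="sym_int4",  # e.g. sym_int4 / fp4 / fp8 / mixed_4bit
    batch_size=1,
)
results = evaluator.simple_evaluate(model=lm, tasks=["arc_challenge"], no_cache=True)
print(results["results"])
```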