From d4dffbdb623cffee0bdc5c5a25b3a3300ec31cc4 Mon Sep 17 00:00:00 2001
From: "Chen, Zhentao"
Date: Thu, 2 Nov 2023 15:14:19 +0800
Subject: [PATCH] Merge harness (#9319)

* add harness patch and llb script
* add readme
* add license
* use patch instead
* update readme
* rename tests to evaluation
* fix typo
* remove nano dependency
* add original harness link
* rename title of usage
* rename BigDLGPULM as BigDLLM
* empty commit to rerun job
---
 python/llm/dev/benchmark/harness/README.md    |  28 ++++
 .../llm/dev/benchmark/harness/bigdl-llm.patch | 151 ++++++++++++++++++
 python/llm/dev/benchmark/harness/llb.py       |  82 ++++++++++
 3 files changed, 261 insertions(+)
 create mode 100644 python/llm/dev/benchmark/harness/README.md
 create mode 100644 python/llm/dev/benchmark/harness/bigdl-llm.patch
 create mode 100644 python/llm/dev/benchmark/harness/llb.py

diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md
new file mode 100644
index 00000000000..8187cd5a90d
--- /dev/null
+++ b/python/llm/dev/benchmark/harness/README.md
@@ -0,0 +1,28 @@
+# Harness Evaluation
+[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to easily measure accuracy on various datasets. Here we have enabled harness evaluation with BigDL-LLM under
+[Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) settings.
+Before running, make sure to have [bigdl-llm](../../../README.md) installed.
+
+## Install Harness
+```bash
+git clone https://github.com/EleutherAI/lm-evaluation-harness.git
+cd lm-evaluation-harness
+git checkout e81d3cc
+pip install -e .
+git apply ../bigdl-llm.patch
+cd ..
+```
+
+## Run
+Run `python llb.py` to start the evaluation. `llb.py` combines several `main.py` arguments to make evaluation easier; the argument mapping is defined as a dict in [`llb.py`](llb.py).
+
+### Evaluation on CPU
+```bash
+python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+```
+### Evaluation on Intel GPU
+```bash
+python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+```
+## Results
+We follow the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) in the metrics we record: `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa`, and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks, which means users may need to average them manually to get the final result.
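The README above leaves the per-subtask averaging of `mmlu` to the user. A minimal sketch of that step, assuming the standard harness result JSON written via `--output_path` and the `{model}_{prec}_{device}_{task}` file naming used by `llb.py` (the path below is an example, not a file produced by this patch):

```python
# Illustrative only (not part of this patch): average the 57 hendrycksTest-*
# subtask accuracies from one harness result file produced by llb.py.
import json

with open("results/output/bigdl-llm_int4_xpu_mmlu") as f:   # example output file name
    results = json.load(f)["results"]                       # {task_name: {"acc": ...}, ...}

accs = [v["acc"] for k, v in results.items() if k.startswith("hendrycksTest-")]
print(f"MMLU average acc over {len(accs)} subtasks: {sum(accs) / len(accs):.4f}")
```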
diff --git a/python/llm/dev/benchmark/harness/bigdl-llm.patch b/python/llm/dev/benchmark/harness/bigdl-llm.patch
new file mode 100644
index 00000000000..dbab82fceda
--- /dev/null
+++ b/python/llm/dev/benchmark/harness/bigdl-llm.patch
@@ -0,0 +1,151 @@
+diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py
+index 8ca27fac..6b581487 100644
+--- a/lm_eval/models/__init__.py
++++ b/lm_eval/models/__init__.py
+@@ -4,6 +4,7 @@ from . import anthropic_llms
+ from . import huggingface
+ from . import textsynth
+ from . import dummy
++from . import bigdl_llm
+ 
+ MODEL_REGISTRY = {
+     "hf": gpt2.HFLM,
+@@ -15,6 +16,7 @@ MODEL_REGISTRY = {
+     "anthropic": anthropic_llms.AnthropicLM,
+     "textsynth": textsynth.TextSynthLM,
+     "dummy": dummy.DummyLM,
++    "bigdl-llm": bigdl_llm.BigDLLM
+ }
+ 
+ 
+diff --git a/lm_eval/models/bigdl_llm.py b/lm_eval/models/bigdl_llm.py
+new file mode 100644
+index 00000000..74010da3
+--- /dev/null
++++ b/lm_eval/models/bigdl_llm.py
+@@ -0,0 +1,124 @@
++#
++# Copyright 2016 The BigDL Authors.
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++#     http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++
++# This code is adapted from the BigDL-LLM llama2 example test.
++import os
++import multiprocessing
++
++from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
++
++import torch
++from typing import Optional, Union
++from lm_eval.base import BaseLM
++
++from transformers import AutoTokenizer, LlamaTokenizer
++
++def _get_dtype(
++    dtype: Union[str, torch.dtype]
++) -> torch.dtype:
++    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
++    if isinstance(dtype, str) and dtype != "auto":
++        # Convert `str` args torch dtype: `float16` -> `torch.float16`
++        _torch_dtype = getattr(torch, dtype)
++    else:
++        _torch_dtype = dtype
++    return _torch_dtype
++
++class BigDLLM(BaseLM):
++    def __init__(
++        self,
++        device="xpu",
++        pretrained="gpt2",
++        revision="main",
++        low_cpu_mem_usage=None,
++        subfolder=None,
++        tokenizer=None,
++        batch_size=1,
++        load_in_8bit: Optional[bool] = False,
++        trust_remote_code: Optional[bool] = False,
++        load_in_low_bit=None,
++        dtype: Optional[Union[str, torch.dtype]] = "auto",
++    ):
++        super().__init__()
++
++        assert isinstance(pretrained, str)
++        assert isinstance(batch_size, (int,str))
++        if device == 'xpu':
++            import intel_extension_for_pytorch as ipex
++        model = AutoModelForCausalLM.from_pretrained(pretrained,
++                                                     load_in_low_bit=load_in_low_bit,
++                                                     optimize_model=True,
++                                                     trust_remote_code=True,
++                                                     use_cache=True,
++                                                     torch_dtype=_get_dtype(dtype))
++        print(model)  # print model to check precision
++        self._device = device
++        self.model = model.to(device)
++
++        self.tokenizer = LlamaTokenizer.from_pretrained(pretrained, trust_remote_code=True)
++
++        # setup for automatic batch size detection
++        if batch_size == 'auto':
++            self.batch_size_per_gpu = batch_size
++        else:
++            self.batch_size_per_gpu = int(batch_size)
++
++    @property
++    def eot_token_id(self):
++        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
++        return self.tokenizer.eos_token_id
++
++    @property
++    def max_length(self):
++        return 2048  # TODO: how to get this from config
++
++    @property
++    def max_gen_toks(self):
++        return 256
++
++    @property
++    def batch_size(self):
++        # TODO: fix multi-gpu
++        return self.batch_size_per_gpu  # * gpus
++
++    @property
++    def device(self):
++        # TODO: fix multi-gpu
++        return torch.device(self._device)
++
++    def tok_encode(self, string: str):
++        input_ids = self.tokenizer.encode(string)
++        return input_ids
++
++    def tok_decode(self, tokens):
++        return self.tokenizer.decode(tokens, skip_special_tokens=True)
++
++    def _model_call(self, inps):
++        """
++        inps: a torch tensor of shape [batch, sequence]
++        the size of sequence may vary from call to call
++
++        returns: a torch tensor of shape [batch, sequence, vocab] with the
++        logits returned from the model
++        """
++        with torch.inference_mode():
++            inps = inps.to(self.device)
++            res = self.model(inps)[0]
++            return res
++
++    def _model_generate(self, context, max_length, eos_token_id):
++        return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id)
+\ No newline at end of file
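For context (not part of the patch): once `bigdl-llm` is registered in `MODEL_REGISTRY`, the harness drives `BigDLLM` through the `BaseLM` interface above — `tok_encode` turns a prompt into token ids and `_model_call` returns `[batch, sequence, vocab]` logits that the harness scores loglikelihoods from. A minimal standalone sketch, assuming the patch has been applied and a local Llama-family checkpoint is available (the wrapper hardcodes `LlamaTokenizer`); the model path is a placeholder:

```python
# Illustrative sketch of the BaseLM contract implemented by BigDLLM.
import torch
from lm_eval.models.bigdl_llm import BigDLLM  # available after applying bigdl-llm.patch

lm = BigDLLM(device="cpu", pretrained="/path/to/llama-model", load_in_low_bit="sym_int4")
ids = lm.tok_encode("The capital of France is")   # list[int]
logits = lm._model_call(torch.tensor([ids]))      # shape: [1, len(ids), vocab_size]
print(logits.shape)
```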
diff --git a/python/llm/dev/benchmark/harness/llb.py b/python/llm/dev/benchmark/harness/llb.py
new file mode 100644
index 00000000000..a167a3c3f06
--- /dev/null
+++ b/python/llm/dev/benchmark/harness/llb.py
@@ -0,0 +1,82 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+# This code is adapted from the BigDL-LLM llama2 example test.
+import argparse
+import os
+import subprocess
+
+task_cmd = "--num_fewshot {} --tasks {}"
+
+task_map = {
+    "hellaswag": task_cmd.format(10, "hellaswag"),
+    "arc": task_cmd.format(25, "arc_challenge"),
+    "truthfulqa": task_cmd.format(0, "truthfulqa_mc"),
+    "mmlu": task_cmd.format(5, "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions")
+}
+
+prec_to_arg = {
+    "bigdl-llm": {
+        "int4": "load_in_low_bit=sym_int4",
+        "nf4": "load_in_low_bit=nf4",
+        "nf3": "load_in_low_bit=nf3",
+        "fp8": "load_in_low_bit=fp8",
+        "fp4": "load_in_low_bit=fp4",
+        "bf16": "dtype=bfloat16",
+        "fp16": "dtype=float16",
+    },
+    "hf-causal": {
+        "nf4": "bnb_type=nf4",
+        "bf16": "dtype=bfloat16",
+        "fp16": "dtype=float16",
+    }
+}
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True, type=str)
+    parser.add_argument("--pretrained", required=True, type=str)
+    parser.add_argument("--precision", required=True, nargs='+', type=str)
+    parser.add_argument("--device", required=True, type=str)
+    parser.add_argument("--batch", default=1, type=int)
+    parser.add_argument("--tasks", required=True, nargs='+', type=str)
+    parser.add_argument("--output_dir", type=str)
+    args = parser.parse_args()
+    print(args.model)
+    print(args.tasks)
+    basic_cmd = "python lm-evaluation-harness/main.py --model {} --model_args pretrained={},{} --no_cache --device {} --batch_size {} {} --output_path {} "
+    os.makedirs(args.output_dir, exist_ok=True)
+    index = 1
+    total = len(args.precision) * len(args.tasks)
+    for prec in args.precision:
+        prec_arg = prec_to_arg[args.model][prec]
+        for task in args.tasks:
+            output_path = f"{args.model}_{prec}_{args.device}_{task}"
+            task_arg = task_map[task]
+            cmd_exec = basic_cmd.format(args.model, args.pretrained, prec_arg, args.device, args.batch,
+                                        task_arg, f"{args.output_dir}/{output_path}")
+            print(f"Running job {index}/{total}:\n{cmd_exec}")
+            index += 1
+            with open(f"{args.output_dir}/log_{output_path}.txt", "w") as f:
+                return_code = subprocess.call(cmd_exec, shell=True, stderr=f, stdout=f)
+            if return_code == 0:
+                print("Successful")
+            else:
+                print("Failed")
+
+main()
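For reference, `llb.py` expands each (precision, task) pair into a single harness invocation built from the `basic_cmd`, `prec_to_arg`, and `task_map` templates above. A sketch of that expansion for one illustrative combination (paths and values are examples only):

```python
# Illustrative only: mirror llb.py's command construction for one combination.
task_cmd = "--num_fewshot {} --tasks {}"
basic_cmd = ("python lm-evaluation-harness/main.py --model {} --model_args pretrained={},{} "
             "--no_cache --device {} --batch_size {} {} --output_path {} ")

cmd = basic_cmd.format("bigdl-llm", "/path/to/model", "load_in_low_bit=sym_int4",
                       "xpu", 1, task_cmd.format(25, "arc_challenge"),
                       "results/output/bigdl-llm_int4_xpu_arc")
print(cmd)
# Prints (wrapped here for readability):
# python lm-evaluation-harness/main.py --model bigdl-llm
#   --model_args pretrained=/path/to/model,load_in_low_bit=sym_int4 --no_cache --device xpu
#   --batch_size 1 --num_fewshot 25 --tasks arc_challenge
#   --output_path results/output/bigdl-llm_int4_xpu_arc
```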