diff --git a/src/instructlab/eval/evaluator.py b/src/instructlab/eval/evaluator.py
index 3bd51d5..c365ec4 100644
--- a/src/instructlab/eval/evaluator.py
+++ b/src/instructlab/eval/evaluator.py
@@ -5,7 +5,7 @@ class Evaluator:
     """
     Parent class for Evaluators
 
-    Atttributes:
+    Attributes:
         model_path   Path to the model to be evaluated
     """
 
diff --git a/src/instructlab/eval/gen_api_answer.py b/src/instructlab/eval/gen_api_answer.py
index e02d059..0c995c4 100644
--- a/src/instructlab/eval/gen_api_answer.py
+++ b/src/instructlab/eval/gen_api_answer.py
@@ -13,7 +13,7 @@
 import shortuuid
 import tqdm
 
-from .common import (
+from common import (
     load_questions,
     temperature_config,
     chat_completion_openai,
diff --git a/src/instructlab/eval/gen_judgment.py b/src/instructlab/eval/gen_judgment.py
index 8b8fefb..48d9f39 100644
--- a/src/instructlab/eval/gen_judgment.py
+++ b/src/instructlab/eval/gen_judgment.py
@@ -8,7 +8,7 @@
 import numpy as np
 from tqdm import tqdm
 
-from instructlab.eval.common import (
+from common import (
     load_questions,
     load_model_answers,
     load_judge_prompts,
diff --git a/src/instructlab/eval/mtbench.py b/src/instructlab/eval/mtbench.py
index 4e7de5c..b2ecb69 100644
--- a/src/instructlab/eval/mtbench.py
+++ b/src/instructlab/eval/mtbench.py
@@ -4,7 +4,7 @@
 # Local
 from .evaluator import Evaluator
 import instructlab.eval.gen_api_answer as gen_api_answer
-import instructlab.eval.gen_judgement as gen_judgement
+import instructlab.eval.gen_judgment as gen_judgment
 
 
 class MT_Bench_Evaluator(Evaluator):
@@ -27,7 +27,7 @@ def gen_answers(self, answer_file, server_url) -> str:
     #def judge_answers(self, judge_endpoint) -> tuple:
     def judge_answers(self, judge_endpoint) -> str:
         """
-        Runs MT-Bench judgement
+        Runs MT-Bench judgment
 
         Returns:
             overall_score       MT-Bench score for the overall model evaluation
@@ -35,7 +35,7 @@ def judge_answers(self, judge_endpoint) -> str:
         """
         os.environ['OPENAI_API_BASE'] = judge_endpoint
         os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
-        output_file = gen_judgement.run(parallel=40)
+        output_file = gen_judgment.run(parallel=40)
 
         return output_file
 
diff --git a/steps_mt_bench.md b/steps_mt_bench.md
index 65dc431..caf22b1 100644
--- a/steps_mt_bench.md
+++ b/steps_mt_bench.md
@@ -1,5 +1,11 @@
 # MT-Bench Broken Down in Eval Scripts (not PR Bench)
 
+You should run with `--tensor-parallel-size ` and possibly raise `--max-model-len` to increase the context length
+
+```shell
+python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --served-model-name granite-7b-lab
+```
+
 ### From justfile: `run_bench`
 
 If dry run:
@@ -24,6 +30,10 @@ OPENAI_API_KEY="NO_API_KEY" python gen_api_answer.py \
 
 results are in data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl
 
+
+### TODO: Figure out the right vllm version. The latest fails with `openai.types` not found
+pip install vllm==0.3.3
+
 ### From justfile: `run_judge`
 
 For running judge model with vllm make sure you run with `--served-model-name gpt-4`
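
For orientation, below is a minimal usage sketch of the `MT_Bench_Evaluator` flow this patch touches (the renamed `gen_judgment` module and the `gen_answers`/`judge_answers` methods). It is not part of the patch: the constructor argument, endpoint URLs, and answer-file path are assumptions taken from the `Evaluator` docstring and the commands in `steps_mt_bench.md`.

```python
# Hypothetical usage sketch (not part of the patch). Assumptions: the
# MT_Bench_Evaluator constructor accepts the model_path attribute documented on
# Evaluator, and the candidate model and judge are served by local vLLM
# OpenAI-compatible endpoints; the ports and file path below are illustrative.
from instructlab.eval.mtbench import MT_Bench_Evaluator

evaluator = MT_Bench_Evaluator(model_path="instructlab/granite-7b-lab")

# Generate candidate answers against the vLLM server started per steps_mt_bench.md
answer_file = evaluator.gen_answers(
    answer_file="data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl",
    server_url="http://localhost:8000/v1",
)

# Judge the answers; judge_answers() exports OPENAI_API_BASE / OPENAI_API_KEY
# itself and then calls gen_judgment.run(parallel=40)
judgment_file = evaluator.judge_answers(judge_endpoint="http://localhost:8001/v1")
print(answer_file, judgment_file)
```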