Working toward a functional state
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 17, 2024
1 parent ad020ef commit b7dab55
Showing 5 changed files with 16 additions and 6 deletions.
2 changes: 1 addition & 1 deletion src/instructlab/eval/evaluator.py
@@ -5,7 +5,7 @@ class Evaluator:
     """
     Parent class for Evaluators
-    Atttributes:
+    Attributes:
         model_path  Path to the model to be evaluated
     """
 
2 changes: 1 addition & 1 deletion src/instructlab/eval/gen_api_answer.py
@@ -13,7 +13,7 @@
 import shortuuid
 import tqdm
 
-from .common import (
+from common import (
     load_questions,
     temperature_config,
     chat_completion_openai,

Check failure on line 16 in src/instructlab/eval/gen_api_answer.py (GitHub Actions / lint): E0401: Unable to import 'common' (import-error)
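The E0401 failure above comes from the bare `from common import (...)` form: a bare module name is resolved against `sys.path`, not against the enclosing package, so it breaks once the file lives inside `instructlab.eval`. A relative import (`from .common import ...`) or a fully qualified one resolves correctly. A minimal sketch of the difference, using a hypothetical package `demo_eval` built in a temporary directory (names and stub contents are illustrative only):

```python
# Sketch: why `from .common import ...` resolves inside a package while a bare
# `from common import ...` does not. Package and module names are hypothetical.
import os
import sys
import tempfile

root = tempfile.mkdtemp()
pkg = os.path.join(root, "demo_eval")
os.makedirs(pkg)

# Mark demo_eval as a package.
open(os.path.join(pkg, "__init__.py"), "w").close()

# demo_eval/common.py -- stands in for src/instructlab/eval/common.py
with open(os.path.join(pkg, "common.py"), "w") as f:
    f.write("def load_questions():\n    return []\n")

# demo_eval/gen_api_answer.py -- the relative import is resolved against the
# enclosing package, so it works when imported as demo_eval.gen_api_answer.
with open(os.path.join(pkg, "gen_api_answer.py"), "w") as f:
    f.write("from .common import load_questions\n")

sys.path.insert(0, root)
from demo_eval.gen_api_answer import load_questions

print(load_questions())  # the stub returns an empty question list
```

A bare `import common` inside `demo_eval/gen_api_answer.py` would only succeed if the `demo_eval` directory itself happened to be on `sys.path`, which is exactly the situation pylint's E0401 is flagging.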
2 changes: 1 addition & 1 deletion src/instructlab/eval/gen_judgment.py
@@ -8,7 +8,7 @@
 import numpy as np
 from tqdm import tqdm
 
-from instructlab.eval.common import (
+from common import (
     load_questions,
     load_model_answers,
     load_judge_prompts,

Check failure on line 11 in src/instructlab/eval/gen_judgment.py (GitHub Actions / lint): E0401: Unable to import 'common' (import-error)
6 changes: 3 additions & 3 deletions src/instructlab/eval/mtbench.py
@@ -4,7 +4,7 @@
 # Local
 from .evaluator import Evaluator
 import instructlab.eval.gen_api_answer as gen_api_answer
-import instructlab.eval.gen_judgement as gen_judgement
+import instructlab.eval.gen_judgment as gen_judgment
 
 
 class MT_Bench_Evaluator(Evaluator):

@@ -27,15 +27,15 @@ def gen_answers(self, answer_file, server_url) -> str:
     #def judge_answers(self, judge_endpoint) -> tuple:
     def judge_answers(self, judge_endpoint) -> str:
         """
-        Runs MT-Bench judgement
+        Runs MT-Bench judgment
         Returns:
             overall_score  MT-Bench score for the overall model evaluation
             qa_pairs       Question and answer pairs from the evaluation
         """
         os.environ['OPENAI_API_BASE'] = judge_endpoint
         os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
-        output_file = gen_judgement.run(parallel=40)
+        output_file = gen_judgment.run(parallel=40)
         return output_file
 
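The `judge_answers` flow in the hunk above (point the OpenAI-compatible client at the judge endpoint via environment variables, then run judgment) can be sketched as follows. Here `_run_judgment` is a hypothetical stand-in for `gen_judgment.run`, and the endpoint and output path are illustrative only:

```python
import os

def _run_judgment(parallel=40):
    # Hypothetical stand-in for gen_judgment.run: the real function fans out
    # `parallel` judge requests and writes a judgment file, returning its path.
    return "data/mt_bench/model_judgment/judgment.jsonl"

def judge_answers(judge_endpoint):
    # Mirrors the committed method: the judge model is reached through an
    # OpenAI-compatible endpoint configured entirely via environment variables;
    # a placeholder key suffices because a local server does not check it.
    os.environ["OPENAI_API_BASE"] = judge_endpoint
    os.environ["OPENAI_API_KEY"] = "NO_API_KEY"
    return _run_judgment(parallel=40)

output_file = judge_answers("http://localhost:8000/v1")
print(output_file)
```

Passing configuration through `OPENAI_API_BASE` keeps the downstream MT-Bench scripts unchanged: anything that constructs an OpenAI client picks up the judge endpoint without new function parameters.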
10 changes: 10 additions & 0 deletions steps_mt_bench.md
@@ -1,5 +1,11 @@
 # MT-Bench Broken Down in Eval Scripts (not PR Bench)
 
+You should run with `--tensor-parallel-size <NUM GPUS>` and possibly increase `--max-model-len` to increase the context length
+
+```shell
+python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --served-model-name granite-7b-lab
+```
+
 ### From justfile: `run_bench`
 
 If dry run:

Check failure on line 9 in steps_mt_bench.md (GitHub Actions / markdown-lint): MD001/heading-increment Heading levels should only increment by one level at a time [Expected: h2; Actual: h3] https://github.com/DavidAnson/markdownlint/blob/v0.34.0/doc/md001.md
@@ -24,6 +30,10 @@ OPENAI_API_KEY="NO_API_KEY" python gen_api_answer.py \
 
 results are in data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl
 
+
+### TODO Figure out the right version. Latest fails with openai.types not found
+pip install vllm==0.3.3
+
 ### From justfile: `run_judge`
 
 For running judge model with vllm make sure you run with `--served-model-name gpt-4`

Check failure on line 33 in steps_mt_bench.md (GitHub Actions / markdown-lint): MD012/no-multiple-blanks Multiple consecutive blank lines [Expected: 1; Actual: 2] https://github.com/DavidAnson/markdownlint/blob/v0.34.0/doc/md012.md

Check failure on line 34 in steps_mt_bench.md (GitHub Actions / markdown-lint): MD022/blanks-around-headings Headings should be surrounded by blank lines [Expected: 1; Actual: 0; Below] [Context: "### TODO Figure out the right version. Latest fails with openai.types not found"] https://github.com/DavidAnson/markdownlint/blob/v0.34.0/doc/md022.md
