Working toward a functional state
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 18, 2024
1 parent ad020ef commit fbcd6d3
Showing 11 changed files with 39 additions and 310 deletions.
80 changes: 0 additions & 80 deletions data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl

This file was deleted.

160 changes: 0 additions & 160 deletions data/mt_bench/model_judgment/gpt-4_single.jsonl

This file was deleted.

44 changes: 8 additions & 36 deletions src/instructlab/eval/common.py
@@ -232,11 +232,10 @@ def play_a_match_single(match: MatchSingle, output_file: str, do_batch: bool=Fal
match.ref_answer,
match.multi_turn,
)
create_batch = False
if do_batch:
batch_output_file = output_file.replace(".jsonl", "-batch-output.jsonl")
if os.path.isfile(batch_output_file):
create_batch = False
else:
if not os.path.isfile(batch_output_file):
create_batch = True

if judge.prompt_template["type"] == "single":
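The simplified flag logic above reduces to a single predicate: create the batch file only when batching is requested and the batch output does not already exist. A minimal sketch of that predicate (the helper name is illustrative, not part of this commit):

```python
import os

def needs_batch_file(output_file: str, do_batch: bool) -> bool:
    """Return True when batching is on and no batch output file exists yet."""
    if not do_batch:
        return False
    batch_output_file = output_file.replace(".jsonl", "-batch-output.jsonl")
    return not os.path.isfile(batch_output_file)
```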
@@ -432,10 +431,10 @@ def play_a_match_pair(match: MatchPair, output_file: str):
)
elif judge.prompt_template["type"] == "single":
m1_score, m1_user_prompt, m1_judgment = run_judge_single(
question, answer_1, judge
question, answer_1, judge, ref_answer
)
m2_score, m2_user_prompt, m2_judgment = run_judge_single(
question, answer_2, judge
question, answer_2, judge, ref_answer
)

if abs(m1_score - m2_score) <= TIE_DELTA:
@@ -471,7 +470,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):

if output_file:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "a") as fout:
with open(output_file, "a", encoding="utf-8") as fout:
fout.write(json.dumps(result) + "\n")

return result
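The result-append path above (ensure the parent directory, then append one UTF-8 JSON line) is a self-contained pattern; a small sketch of it as a standalone helper, with illustrative names only:

```python
import json
import os

def append_jsonl(path: str, record: dict) -> None:
    """Append a single record to a JSONL file, creating parent directories as needed."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as fout:
        fout.write(json.dumps(record) + "\n")
```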
@@ -532,12 +531,12 @@ def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict=
)
output = response["choices"][0]["message"]["content"]
break
except openai.error.OpenAIError as e:
print(type(e), e)
time.sleep(API_RETRY_SLEEP)
except openai.error.InvalidRequestError as e:
print(type(e), e)
break
except openai.error.OpenAIError as e:
print(type(e), e)
time.sleep(API_RETRY_SLEEP)
except KeyError:
print(response)
break
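The reorder above matters because, in the openai 0.x error hierarchy, `InvalidRequestError` is a subclass of `OpenAIError`; with the broad handler listed first, invalid requests fell into the retry branch instead of breaking out. A minimal sketch of the pattern with stand-in exception classes (not the real openai types):

```python
import time

class APIError(Exception):
    """Stand-in for a broad error class such as openai.error.OpenAIError."""

class InvalidRequest(APIError):
    """Stand-in for a narrower subclass such as openai.error.InvalidRequestError."""

def call_with_retry(call, retries=3, sleep_s=0.1):
    for _ in range(retries):
        try:
            return call()
        except InvalidRequest as e:
            # Must be listed before APIError, or this handler is never reached.
            print(type(e), e)
            break  # a malformed request will not succeed on retry
        except APIError as e:
            print(type(e), e)
            time.sleep(sleep_s)  # transient failure: back off and retry
    return None
```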
@@ -571,32 +570,6 @@ def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=Non
return output.strip()


def chat_completion_palm(chat_state, model, conv, temperature, max_tokens):
from fastchat.serve.api_provider import init_palm_chat

assert model == "palm-2-chat-bison-001"

if chat_state is None:
chat_state = init_palm_chat("chat-bison@001")

parameters = {
"temperature": temperature,
"top_p": 0.8,
"top_k": 40,
"max_output_tokens": max_tokens,
}
output = API_ERROR_OUTPUT
for _ in range(API_MAX_RETRY):
try:
response = chat_state.send_message(conv.messages[-2][1], **parameters)
output = response.text
break
except Exception as e:
print(type(e), e)
time.sleep(API_RETRY_SLEEP)
return chat_state, output


def normalize_game_key_single(gamekey, result):
"""Make the model names sorted in a game key."""
qid, model_1, model_2 = gamekey
@@ -778,7 +751,6 @@ def check_data(questions, model_answers, ref_answers, models, judges):
for q in questions:
if q["category"] not in NEED_REF_CATS:
continue
print(f"ALI: ref_answers is: {ref_answers}")
assert (
q["question_id"] in ref_answers[jg.model_name]
), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
2 changes: 1 addition & 1 deletion src/instructlab/eval/evaluator.py
@@ -5,7 +5,7 @@ class Evaluator:
"""
Parent class for Evaluators
Atttributes:
Attributes:
model_path Path to the model to be evaluated
"""

5 changes: 0 additions & 5 deletions src/instructlab/eval/gen_api_answer.py
@@ -18,7 +18,6 @@
temperature_config,
chat_completion_openai,
chat_completion_anthropic,
chat_completion_palm,
ANTHROPIC_MODEL_LIST,
)
#TODO need to look into this dependency
@@ -65,10 +64,6 @@ def get_answer(

if model in ANTHROPIC_MODEL_LIST:
output = chat_completion_anthropic(model, conv, temperature, max_tokens)
elif model == "palm-2-chat-bison-001":
chat_state, output = chat_completion_palm(
chat_state, model, conv, temperature, max_tokens
)
else:
output = chat_completion_openai(model, conv, temperature, max_tokens)

9 changes: 7 additions & 2 deletions src/instructlab/eval/gen_judgment.py
@@ -5,10 +5,11 @@
from concurrent.futures import ThreadPoolExecutor
import json

import openai
import numpy as np
from tqdm import tqdm

from instructlab.eval.common import (
from .common import (
load_questions,
load_model_answers,
load_judge_prompts,
@@ -175,7 +176,11 @@ def run(
parallel=1,
first_n=None,
yes=True,
batch=True):
batch=False,
openai_api_base=None):

if openai_api_base is not None:
openai.api_base = openai_api_base

question_file = f"data/{bench_name}/question.jsonl"
answer_dir = f"data/{bench_name}/model_answer/instructlab"
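With `openai_api_base` now threaded through `run()`, callers can point judgment at a local judge endpoint directly instead of relying on an environment variable. A sketch of the resulting call, mirroring how `mtbench.py` invokes it later in this commit (the endpoint URL is a placeholder):

```python
import os

from instructlab.eval import gen_judgment

os.environ["OPENAI_API_KEY"] = "NO_API_KEY"  # the judge client still expects a key to be set

# Placeholder endpoint; any OpenAI-compatible judge server works here.
output_file = gen_judgment.run(parallel=40, openai_api_base="http://localhost:8000/v1")
print(output_file)
```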
13 changes: 6 additions & 7 deletions src/instructlab/eval/mtbench.py
@@ -4,7 +4,7 @@
# Local
from .evaluator import Evaluator
import instructlab.eval.gen_api_answer as gen_api_answer
import instructlab.eval.gen_judgement as gen_judgement
import instructlab.eval.gen_judgment as gen_judgment


class MT_Bench_Evaluator(Evaluator):
@@ -18,24 +18,23 @@ class MT_Bench_Evaluator(Evaluator):
def __init__(self, server_url: str) -> None:
self.server_url = server_url

def gen_answers(self, answer_file, server_url) -> str:
def gen_answers(self, server_url) -> str:
""" Asks questions to model, returns path to answers"""
os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
gen_api_answer.run(answer_file=answer_file, model_name="instructlab/granite-7b-lab", openai_api_base=server_url)
return answer_file
gen_api_answer.run(model_name="instructlab/granite-7b-lab", openai_api_base=server_url)

#def judge_answers(self, judge_endpoint) -> tuple:
def judge_answers(self, judge_endpoint) -> str:
"""
Runs MT-Bench judgement
Runs MT-Bench judgment
Returns:
overall_score MT-Bench score for the overall model evaluation
qa_pairs Question and answer pairs from the evaluation
"""
os.environ['OPENAI_API_BASE'] = judge_endpoint
#os.environ['OPENAI_API_BASE'] = judge_endpoint
os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
output_file = gen_judgement.run(parallel=40)
output_file = gen_judgment.run(parallel=40, openai_api_base=judge_endpoint)
return output_file


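Taken together, the reworked `MT_Bench_Evaluator` surface drives the whole flow in a few lines; a sketch combining the two test scripts added in this commit (URLs are placeholders for a running model server and judge endpoint):

```python
from instructlab.eval.mtbench import MT_Bench_Evaluator

evaluator = MT_Bench_Evaluator(server_url="http://localhost:8000")

# Step 1: generate model answers against the serving endpoint.
evaluator.gen_answers("http://localhost:8000/v1")

# Step 2: judge the generated answers and print the judgment file path.
judgment_file = evaluator.judge_answers("http://localhost:8000/v1")
print(judgment_file)
```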
4 changes: 2 additions & 2 deletions src/instructlab/eval/show_result.py
@@ -112,8 +112,8 @@ def display_result_pairwise(args):
choices=["pairwise-baseline", "pairwise-all", "single"],
help=(
"Evaluation mode. "
"`pairwise-baseline` runs pairwise comparision against a baseline. "
"`pairwise-all` runs pairwise comparision between all pairs. "
"`pairwise-baseline` runs pairwise comparison against a baseline. "
"`pairwise-all` runs pairwise comparison between all pairs. "
"`single` runs single answer grading."
),
)
26 changes: 11 additions & 15 deletions steps_mt_bench.md
@@ -1,25 +1,21 @@
# MT-Bench Broken Down in Eval Scripts (not PR Bench)

### From justfile: `run_bench`
## TODO Figure out the right version. Latest fails with openai.types not found

pip install vllm==0.3.3

You should run with `--tensor-parallel-size <NUM GPUS>` and possibly increase `--max-model-len` to increase the context length

If dry run:
```shell
OPENAI_API_KEY="NO_API_KEY" python gen_api_answer.py \
--bench-name mt_bench \
--openai-api-base http://localhost:8000/v1 \
--model granite-7b-lab \
--num-choices 1 \
--question-begin 2 \
--question-end 4
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab
```

## From justfile: `run_bench`

If not dry run

```shell
OPENAI_API_KEY="NO_API_KEY" python gen_api_answer.py \
--bench-name mt_bench \
--openai-api-base http://localhost:8000/v1 \
--model granite-7b-lab \
--num-choices 1
OPENAI_API_KEY="NO_API_KEY" python3 test_gen_answers.py
```

results are in data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl
@@ -35,7 +31,7 @@ python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab
```

```shell
OPENAI_API_BASE=http://0.0.0.0:8000/v1 OPENAI_API_KEY="NO_API_KEY" python src/instructlab/eval/gen_judgment.py --bench-name mt_bench --parallel 40 --yes
OPENAI_API_KEY="NO_API_KEY" python3 test_judge_answers.py
```

results are in data/mt_bench/model_judgment/gpt-4_single.jsonl
4 changes: 4 additions & 0 deletions test_gen_answers.py
@@ -0,0 +1,4 @@
from instructlab.eval.mtbench import MT_Bench_Evaluator

mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")
mt_bench.gen_answers("http://localhost:8000/v1")
2 changes: 0 additions & 2 deletions test_mt_bench.py → test_judge_answers.py
@@ -1,7 +1,5 @@
from instructlab.eval.mtbench import MT_Bench_Evaluator

mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")
#path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
#print(path)
output_file = mt_bench.judge_answers("http://localhost:8000/v1")
print(output_file)
