Updating to openai > 1.0
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 26, 2024
1 parent 100c512 commit d45fc4d
Showing 10 changed files with 41 additions and 41 deletions.
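The thrust of the change set: the pre-1.0 module-level openai API (openai.api_base, openai.ChatCompletion.create, dict-style responses, openai.error exceptions) is replaced with the 1.x client-object API, and an explicit client is threaded through answer generation and judging. A minimal sketch of the before/after pattern, assuming a local vLLM server as in the README (the base_url and model below are placeholders; "NO_API_KEY" mirrors the placeholder key used in the diff):

```python
# Sketch only: base_url and model are illustrative, not values this commit pins.
import openai

# pre-1.0 (removed):  openai.api_base = url
#                     resp = openai.ChatCompletion.create(...)
#                     text = resp["choices"][0]["message"]["content"]
# 1.x (adopted here):
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="NO_API_KEY")
resp = client.chat.completions.create(
    model="instructlab/granite-7b-lab",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(resp.choices[0].message.content)
```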
4 changes: 2 additions & 2 deletions README.md
@@ -26,7 +26,7 @@ source venv/bin/activate
 pip install -r requirements.txt
 pip install -r requirements-dev.txt
 pip install -e .
-pip install vllm==0.3.3
+pip install vllm
 python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1
 ```

@@ -65,7 +65,7 @@ eval_output/
 ```

 ```shell
-export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times
+export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=40 # Optional if you want to shorten run times
 python3 tests/test_judge_answers.py
 python3 tests/test_branch_judge_answers.py
 ```
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@
 FastChat
 GitPython>=3.1.42,<4.0.0
 shortuuid
-openai<1.0.0
+openai>=1.13.3,<2.0.0
 psutil
 torch
 transformers
6 changes: 0 additions & 6 deletions src/instructlab/eval/__init__.py
@@ -1,6 +0,0 @@
# Standard
import os

openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None:
os.environ["OPENAI_API_KEY"] = "NO_API_KEY"
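With the 1.x client, the API key is passed per client instance rather than read from the environment at import time, so this module-level fallback is no longer needed; the call sites below construct openai.OpenAI(..., api_key="NO_API_KEY") explicitly instead.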
3 changes: 1 addition & 2 deletions src/instructlab/eval/mmlu.py
@@ -50,8 +50,7 @@ def run(self) -> tuple:

         individual_scores: dict = {}
         agg_score: float = 0.0
-        model_args = f"pretrained= {self.model_path}, dtype= {self.model_dtype}"
-
+        model_args = f"pretrained={self.model_path}, dtype={self.model_dtype}"
         mmlu_output = simple_evaluate(
             model="hf",
             model_args=model_args,
8 changes: 4 additions & 4 deletions src/instructlab/eval/mt_bench.py
@@ -43,7 +43,7 @@ def gen_answers(self, server_url) -> None:
         """
         mt_bench_answers.generate_answers(
             self.model_name,
-            model_api_base=server_url,
+            server_url,
             output_dir=self.output_dir,
             max_workers=self.max_workers,
         )
@@ -63,8 +63,8 @@ def judge_answers(self, server_url) -> tuple:
         return mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
+            server_url,
             max_workers=self.max_workers,
-            model_api_base=server_url,
             output_dir=self.output_dir,
         )

@@ -113,8 +113,8 @@ def gen_answers(self, server_url) -> None:
         )
         mt_bench_answers.generate_answers(
             self.model_name,
+            server_url,
             branch=self.branch,
-            model_api_base=server_url,
             output_dir=self.output_dir,
             data_dir=self.output_dir,
             max_workers=self.max_workers,
@@ -134,9 +134,9 @@ def judge_answers(self, server_url) -> tuple:
         _, qa_pairs, _ = mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
+            server_url,
             branch=self.branch,
             max_workers=self.max_workers,
-            model_api_base=server_url,
             output_dir=self.output_dir,
             data_dir=self.output_dir,
             bench_name="mt_bench_branch",
11 changes: 7 additions & 4 deletions src/instructlab/eval/mt_bench_answers.py
@@ -41,6 +41,7 @@ def get_answer(
     max_tokens: int,
     answer_file: str,
     force_temperature: float,
+    openai_client,
 ):
     """Answer a question with the model"""
     assert force_temperature is None or question.get("required_temperature") is None
@@ -62,7 +63,9 @@ def get_answer(
         conv.append_message(conv.roles[0], question["turns"][j])
         conv.append_message(conv.roles[1], None)

-        output = chat_completion_openai(model, conv, temperature, max_tokens)
+        output = chat_completion_openai(
+            openai_client, model, conv, temperature, max_tokens
+        )

         conv.update_last_message(output)
         turns.append(output)
@@ -85,6 +88,7 @@ def get_answer(

 def generate_answers(
     model_name,
+    model_api_base,
     branch=None,
     output_dir="eval_output",
     data_dir=None,
@@ -94,12 +98,10 @@ def generate_answers(
     num_choices=1,
     max_tokens=1024,
     max_workers=1,
-    model_api_base=None,
     bench_name="mt_bench",
 ):
     """Generate model answers to be judged"""
-    if model_api_base is not None:
-        openai.api_base = model_api_base
+    openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

     if data_dir is None:
         data_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -125,6 +127,7 @@ def generate_answers(
                 max_tokens,
                 answer_file,
                 force_temperature,
+                openai_client,
             )
             futures.append(future)
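The refactor above builds one client per generate_answers run and hands it to every worker, instead of mutating module-level openai state. A rough sketch of that shape, with hypothetical questions and placeholder URL and model (the 1.x client wraps httpx and can generally be shared across threads):

```python
# Sketch of the client-per-run pattern; get_answer is a simplified stand-in
# for the module's real function, and all values are illustrative.
from concurrent.futures import ThreadPoolExecutor

import openai


def get_answer(question, openai_client):
    resp = openai_client.chat.completions.create(
        model="instructlab/granite-7b-lab",
        messages=[{"role": "user", "content": question}],
    )
    return resp.choices[0].message.content


openai_client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="NO_API_KEY")
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [
        executor.submit(get_answer, q, openai_client)
        for q in ["What is MT-Bench?", "What is MMLU?"]
    ]
    answers = [f.result() for f in futures]
```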
20 changes: 10 additions & 10 deletions src/instructlab/eval/mt_bench_common.py
@@ -117,7 +117,7 @@ def load_judge_prompts(prompt_file: str) -> dict:


 def run_judge_single(
-    question, answer, judge, ref_answer, multi_turn=False, judgment=None
+    question, answer, judge, ref_answer, openai_client, multi_turn=False, judgment=None
 ):
     kwargs = {}
     model = judge.model_name
@@ -150,7 +150,9 @@ def run_judge_single(
         conv.append_message(conv.roles[1], None)

     if judgment is None:
-        judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
+        judgment = chat_completion_openai(
+            openai_client, model, conv, temperature=0, max_tokens=2048
+        )

     if judge.prompt_template["output_format"] == "[[rating]]":
         match = re.search(one_score_pattern, judgment)
@@ -169,7 +171,7 @@ def run_judge_single(
     return rating, user_prompt, judgment


-def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
+def play_a_match_single(openai_client, match: MatchSingle, output_file: str) -> dict:
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -186,6 +188,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
         answer,
         judge,
         ref_answer,
+        openai_client,
         multi_turn=multi_turn,
         judgment=judgment,
     )
@@ -215,10 +218,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
     return result


-def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None) -> str:
-    if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
+def chat_completion_openai(openai_client, model, conv, temperature, max_tokens) -> str:
     output = API_ERROR_OUTPUT
     for _ in range(API_MAX_RETRY):
         try:
@@ -232,16 +232,16 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None)
                     messages[0]["content"] + "\n" + messages[1]["content"]
                 )
                 messages = messages[1:]
-            response = openai.ChatCompletion.create(
+            response = openai_client.chat.completions.create(
                 model=model,
                 messages=messages,
                 n=1,
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
-            output = response["choices"][0]["message"]["content"]
+            output = response.choices[0].message.content
             break
-        except openai.error.OpenAIError as e:
+        except openai.OpenAIError as e:
             print(type(e), e)
             time.sleep(API_RETRY_SLEEP)
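Two 1.x details show up in that hunk: responses are typed objects (attribute access, not dict subscripts), and the exception hierarchy moved from openai.error.* to the package top level. A self-contained sketch of the retry loop, with illustrative constants (the module defines its own retry values, which may differ):

```python
# Illustrative retry values and a simplified stand-in for the module's helper.
import time

import openai

API_MAX_RETRY = 3
API_RETRY_SLEEP = 2
API_ERROR_OUTPUT = "$ERROR$"


def chat_once(openai_client, model, messages):
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = openai_client.chat.completions.create(
                model=model, messages=messages, n=1, temperature=0, max_tokens=2048
            )
            output = response.choices[0].message.content  # attribute access in 1.x
            break
        except openai.OpenAIError as e:  # top-level in 1.x (was openai.error.*)
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output
```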
24 changes: 14 additions & 10 deletions src/instructlab/eval/mt_bench_judgment.py
@@ -90,12 +90,15 @@ def make_judgment(
     if bench_name == "mt_bench":
         # Second turn
         df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean()
-        turn2_score = df_2["score"].iloc[0]
-        turn_scores.append(turn2_score)
+        if len(df_2.index) > 0:
+            turn2_score = df_2["score"].iloc[0]
+            turn_scores.append(turn2_score)

-        # Average
-        df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
-        overall_score = df_3["score"].iloc[0]
+            # Average
+            df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
+            overall_score = df_3["score"].iloc[0]
+        else:
+            turn_scores.append("N/A")

     question_df = pd.read_json(question_file, lines=True)

@@ -128,6 +131,7 @@ def make_judgment(
 def judge_model(
     model_name,
     judge_model_name,
+    openai_client,
     branch=None,
     bench_name="mt_bench",
     output_dir="eval_output",
@@ -218,11 +222,11 @@ def judge_model(
     # Play matches
     if max_workers == 1:
         for match in tqdm(matches):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
     else:

         def play_a_match_wrapper(match):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)

         np.random.seed(0)
         np.random.shuffle(matches)
@@ -239,18 +243,17 @@ def play_a_match_wrapper(match):
 def generate_judgment(
     model_name,
     judge_model_name,
+    model_api_base,
     bench_name="mt_bench",
     output_dir="eval_output",
     data_dir=None,
     branch=None,
     model_list=None,
     max_workers=1,
     first_n=None,
-    model_api_base=None,
 ):
     """Generate judgment with scores and qa_pairs for a model"""
-    if model_api_base is not None:
-        openai.api_base = model_api_base
+    openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

     first_n_env = os.environ.get("INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS")
     if first_n_env is not None and first_n is None:
@@ -259,6 +262,7 @@ def generate_judgment(
     question_file, judgment_file, answer_file = judge_model(
         model_name,
         judge_model_name,
+        openai_client,
         bench_name=bench_name,
         output_dir=output_dir,
         data_dir=data_dir,
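The new len(df_2.index) guard matters because .iloc[0] on an empty selection raises IndexError, for example when no second-turn judgments exist yet. A toy reproduction with synthetic data:

```python
# Synthetic single-turn data: the turn == 2 selection comes back empty.
import pandas as pd

judgment_df = pd.DataFrame({"model": ["m"], "turn": [1], "score": [7.0]})
df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean()
if len(df_2.index) > 0:
    print(df_2["score"].iloc[0])
else:
    print("N/A")  # without the guard, df_2["score"].iloc[0] raises IndexError
```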
2 changes: 1 addition & 1 deletion tests/test_branch_judge_answers.py
@@ -14,7 +14,7 @@
 print(f"QA Pair 0:")
 pprint.pprint(qa_pairs[0])

-print(f"base_qa_pairs length: {len(qa_pairs)}")
+print(f"qa_pairs length: {len(qa_pairs)}")

 for qa_pair in qa_pairs:
     question_id = qa_pair.get("question_id")
2 changes: 1 addition & 1 deletion tests/test_judge_answers.py
@@ -15,7 +15,7 @@
 print(f"QA Pair 0:")
 pprint.pprint(qa_pairs[0])

-print(f"base_qa_pairs length: {len(qa_pairs)}")
+print(f"qa_pairs length: {len(qa_pairs)}")

 for qa_pair in qa_pairs:
     assert qa_pair.get("question_id") is not None
