diff --git a/README.md b/README.md
index 0e36575..30f8c0b 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ source venv/bin/activate
 pip install -r requirements.txt
 pip install -r requirements-dev.txt
 pip install -e .
-pip install vllm==0.3.3
+pip install vllm
 python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1
 ```
 
@@ -65,7 +65,7 @@ eval_output/
 ```
 
 ```shell
-export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times
+export INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=40 # Optional if you want to shorten run times
 python3 tests/test_judge_answers.py
 python3 tests/test_branch_judge_answers.py
 ```
diff --git a/requirements.txt b/requirements.txt
index eaa9b0c..5174093 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 FastChat
 GitPython>=3.1.42,<4.0.0
 shortuuid
-openai<1.0.0
+openai>=1.13.3,<2.0.0
 psutil
 torch
 transformers
diff --git a/src/instructlab/eval/__init__.py b/src/instructlab/eval/__init__.py
index 34dd2c1..e69de29 100644
--- a/src/instructlab/eval/__init__.py
+++ b/src/instructlab/eval/__init__.py
@@ -1,6 +0,0 @@
-# Standard
-import os
-
-openai_api_key = os.environ.get("OPENAI_API_KEY")
-if openai_api_key is None:
-    os.environ["OPENAI_API_KEY"] = "NO_API_KEY"
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index b95f476..a6e8af6 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -50,8 +50,7 @@ def run(self) -> tuple:
         individual_scores: dict = {}
         agg_score: float = 0.0
 
-        model_args = f"pretrained= {self.model_path}, dtype= {self.model_dtype}"
-
+        model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
         mmlu_output = simple_evaluate(
             model="hf",
             model_args=model_args,
diff --git a/src/instructlab/eval/mt_bench.py b/src/instructlab/eval/mt_bench.py
index 0683167..3a5f12f 100644
--- a/src/instructlab/eval/mt_bench.py
+++ b/src/instructlab/eval/mt_bench.py
@@ -43,7 +43,7 @@ def gen_answers(self, server_url) -> None:
         """
         mt_bench_answers.generate_answers(
             self.model_name,
-            model_api_base=server_url,
+            server_url,
             output_dir=self.output_dir,
             max_workers=self.max_workers,
         )
@@ -63,8 +63,8 @@ def judge_answers(self, server_url) -> tuple:
         return mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
+            server_url,
             max_workers=self.max_workers,
-            model_api_base=server_url,
             output_dir=self.output_dir,
         )
 
@@ -113,8 +113,8 @@ def gen_answers(self, server_url) -> None:
         )
         mt_bench_answers.generate_answers(
             self.model_name,
+            server_url,
             branch=self.branch,
-            model_api_base=server_url,
             output_dir=self.output_dir,
             data_dir=self.output_dir,
             max_workers=self.max_workers,
@@ -134,9 +134,9 @@ def judge_answers(self, server_url) -> tuple:
         _, qa_pairs, _ = mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
+            server_url,
             branch=self.branch,
             max_workers=self.max_workers,
-            model_api_base=server_url,
             output_dir=self.output_dir,
             data_dir=self.output_dir,
             bench_name="mt_bench_branch",
diff --git a/src/instructlab/eval/mt_bench_answers.py b/src/instructlab/eval/mt_bench_answers.py
index 3c223a3..890980d 100644
--- a/src/instructlab/eval/mt_bench_answers.py
+++ b/src/instructlab/eval/mt_bench_answers.py
@@ -41,6 +41,7 @@ def get_answer(
     max_tokens: int,
     answer_file: str,
     force_temperature: float,
+    openai_client,
 ):
     """Answer a question with the model"""
     assert force_temperature is None or question.get("required_temperature") is None
@@ -62,7 +63,9 @@
             conv.append_message(conv.roles[0], question["turns"][j])
             conv.append_message(conv.roles[1], None)
 
-            output = chat_completion_openai(model, conv, temperature, max_tokens)
+            output = chat_completion_openai(
+                openai_client, model, conv, temperature, max_tokens
+            )
 
             conv.update_last_message(output)
             turns.append(output)
@@ -85,6 +88,7 @@ def get_answer(
 
 def generate_answers(
     model_name,
+    model_api_base,
     branch=None,
     output_dir="eval_output",
     data_dir=None,
@@ -94,12 +98,10 @@
     num_choices=1,
     max_tokens=1024,
     max_workers=1,
-    model_api_base=None,
     bench_name="mt_bench",
 ):
     """Generate model answers to be judged"""
-    if model_api_base is not None:
-        openai.api_base = model_api_base
+    openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")
 
     if data_dir is None:
         data_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -125,6 +127,7 @@
                 max_tokens,
                 answer_file,
                 force_temperature,
+                openai_client,
             )
             futures.append(future)
 
diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index 1cf680f..d8fa602 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -117,7 +117,7 @@ def load_judge_prompts(prompt_file: str) -> dict:
 
 
 def run_judge_single(
-    question, answer, judge, ref_answer, multi_turn=False, judgment=None
+    question, answer, judge, ref_answer, openai_client, multi_turn=False, judgment=None
 ):
     kwargs = {}
     model = judge.model_name
@@ -150,7 +150,9 @@
     conv.append_message(conv.roles[1], None)
 
     if judgment is None:
-        judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
+        judgment = chat_completion_openai(
+            openai_client, model, conv, temperature=0, max_tokens=2048
+        )
 
     if judge.prompt_template["output_format"] == "[[rating]]":
         match = re.search(one_score_pattern, judgment)
@@ -169,7 +171,7 @@
     return rating, user_prompt, judgment
 
 
-def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
+def play_a_match_single(openai_client, match: MatchSingle, output_file: str) -> dict:
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -186,6 +188,7 @@
             answer,
             judge,
             ref_answer,
+            openai_client,
             multi_turn=multi_turn,
             judgment=judgment,
         )
@@ -215,10 +218,7 @@
     return result
 
 
-def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None) -> str:
-    if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
+def chat_completion_openai(openai_client, model, conv, temperature, max_tokens) -> str:
     output = API_ERROR_OUTPUT
     for _ in range(API_MAX_RETRY):
         try:
@@ -232,16 +232,16 @@
                     messages[0]["content"] + "\n" + messages[1]["content"]
                 )
                 messages = messages[1:]
-            response = openai.ChatCompletion.create(
+            response = openai_client.chat.completions.create(
                 model=model,
                 messages=messages,
                 n=1,
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
-            output = response["choices"][0]["message"]["content"]
+            output = response.choices[0].message.content
             break
-        except openai.error.OpenAIError as e:
+        except openai.OpenAIError as e:
             print(type(e), e)
             time.sleep(API_RETRY_SLEEP)
 
diff --git a/src/instructlab/eval/mt_bench_judgment.py b/src/instructlab/eval/mt_bench_judgment.py
index 576d17f..71c7351 100644
--- a/src/instructlab/eval/mt_bench_judgment.py
+++ b/src/instructlab/eval/mt_bench_judgment.py
@@ -90,12 +90,15 @@ def make_judgment(
     if bench_name == "mt_bench":
         # Second turn
         df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean()
-        turn2_score = df_2["score"].iloc[0]
-        turn_scores.append(turn2_score)
+        if len(df_2.index) > 0:
+            turn2_score = df_2["score"].iloc[0]
+            turn_scores.append(turn2_score)
 
-        # Average
-        df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
-        overall_score = df_3["score"].iloc[0]
+            # Average
+            df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
+            overall_score = df_3["score"].iloc[0]
+        else:
+            turn_scores.append("N/A")
 
     question_df = pd.read_json(question_file, lines=True)
 
@@ -128,6 +131,7 @@ def make_judgment(
 def judge_model(
     model_name,
     judge_model_name,
+    openai_client,
     branch=None,
     bench_name="mt_bench",
     output_dir="eval_output",
@@ -218,11 +222,11 @@ def judge_model(
     # Play matches
     if max_workers == 1:
         for match in tqdm(matches):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
     else:
 
         def play_a_match_wrapper(match):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
 
         np.random.seed(0)
         np.random.shuffle(matches)
@@ -239,6 +243,7 @@ def play_a_match_wrapper(match):
 def generate_judgment(
     model_name,
     judge_model_name,
+    model_api_base,
     bench_name="mt_bench",
     output_dir="eval_output",
     data_dir=None,
@@ -246,19 +251,18 @@
     model_list=None,
     max_workers=1,
     first_n=None,
-    model_api_base=None,
 ):
     """Generate judgment with scores and qa_pairs for a model"""
-    if model_api_base is not None:
-        openai.api_base = model_api_base
+    openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")
 
-    first_n_env = os.environ.get("INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS")
+    first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS")
     if first_n_env is not None and first_n is None:
         first_n = int(first_n_env)
 
     question_file, judgment_file, answer_file = judge_model(
         model_name,
         judge_model_name,
+        openai_client,
         bench_name=bench_name,
         output_dir=output_dir,
         data_dir=data_dir,
diff --git a/tests/test_branch_judge_answers.py b/tests/test_branch_judge_answers.py
index 90a81fb..ada55f0 100644
--- a/tests/test_branch_judge_answers.py
+++ b/tests/test_branch_judge_answers.py
@@ -14,7 +14,7 @@
 print(f"QA Pair 0:")
 pprint.pprint(qa_pairs[0])
 
-print(f"base_qa_pairs length: {len(qa_pairs)}")
+print(f"qa_pairs length: {len(qa_pairs)}")
 
 for qa_pair in qa_pairs:
     question_id = qa_pair.get("question_id")
diff --git a/tests/test_judge_answers.py b/tests/test_judge_answers.py
index 2d1742a..4857094 100644
--- a/tests/test_judge_answers.py
+++ b/tests/test_judge_answers.py
@@ -15,7 +15,7 @@
 print(f"QA Pair 0:")
 pprint.pprint(qa_pairs[0])
 
-print(f"base_qa_pairs length: {len(qa_pairs)}")
+print(f"qa_pairs length: {len(qa_pairs)}")
 
 for qa_pair in qa_pairs:
     assert qa_pair.get("question_id") is not None
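
The core of this change is the migration from the pre-1.0 module-level `openai` API (`openai.api_base`, `openai.ChatCompletion.create`, dict-style responses, `openai.error.OpenAIError`) to the openai>=1.x client-object API threaded explicitly through the call chain. A minimal sketch of that pattern, assuming a local vLLM server at `http://localhost:8000/v1`; the endpoint and prompt below are illustrative, not values taken from this diff:

```python
# Sketch of the openai>=1.x client pattern adopted in this diff.
# The base_url is an assumed local vLLM endpoint, not a value from the diff.
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="NO_API_KEY")

try:
    response = client.chat.completions.create(
        model="instructlab/granite-7b-lab",
        messages=[{"role": "user", "content": "Say hello."}],
        n=1,
        temperature=0,
        max_tokens=256,
    )
    # 1.x responses are typed objects, not dicts.
    print(response.choices[0].message.content)
except openai.OpenAIError as err:
    # Mirrors the retry-style error handling kept in chat_completion_openai().
    print(type(err), err)
```

Passing an explicit client (with `api_key="NO_API_KEY"`) replaces the old pattern of mutating module-level state, which appears to be why the `OPENAI_API_KEY` shim in `src/instructlab/eval/__init__.py` could be deleted.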