diff --git a/README.md b/README.md
index 0e36575..d1d736b 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@
 source venv/bin/activate
 pip install -r requirements.txt
 pip install -r requirements-dev.txt
 pip install -e .
-pip install vllm==0.3.3
+pip install vllm
 python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1
 ```
@@ -65,7 +65,7 @@
 eval_output/
 ```
 ```shell
-export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times
+export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=40 # Optional if you want to shorten run times
 python3 tests/test_judge_answers.py
 python3 tests/test_branch_judge_answers.py
 ```
diff --git a/requirements.txt b/requirements.txt
index bc02276..836409f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 FastChat
 GitPython>=3.1.42,<4.0.0
 shortuuid
-openai<1.0.0
+openai>=1.13.3,<2.0.0
 psutil
 torch
 transformers
diff --git a/src/instructlab/eval/mt_bench_answers.py b/src/instructlab/eval/mt_bench_answers.py
index 3c223a3..0b2bb4b 100644
--- a/src/instructlab/eval/mt_bench_answers.py
+++ b/src/instructlab/eval/mt_bench_answers.py
@@ -41,6 +41,7 @@ def get_answer(
     max_tokens: int,
     answer_file: str,
     force_temperature: float,
+    openai_client,
 ):
     """Answer a question with the model"""
     assert force_temperature is None or question.get("required_temperature") is None
@@ -62,7 +63,9 @@ def get_answer(
         conv.append_message(conv.roles[0], question["turns"][j])
         conv.append_message(conv.roles[1], None)
 
-        output = chat_completion_openai(model, conv, temperature, max_tokens)
+        output = chat_completion_openai(
+            openai_client, model, conv, temperature, max_tokens
+        )
         conv.update_last_message(output)
         turns.append(output)
 
@@ -99,7 +102,9 @@ def generate_answers(
 ):
     """Generate model answers to be judged"""
     if model_api_base is not None:
-        openai.api_base = model_api_base
+        openai_client = openai.OpenAI(base_url=model_api_base)
+    else:
+        openai_client = openai.OpenAI()
 
     if data_dir is None:
         data_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -125,6 +130,7 @@ def generate_answers(
                 max_tokens,
                 answer_file,
                 force_temperature,
+                openai_client,
             )
             futures.append(future)
 
diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index 1cf680f..d8fa602 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -117,7 +117,7 @@ def load_judge_prompts(prompt_file: str) -> dict:
 
 
 def run_judge_single(
-    question, answer, judge, ref_answer, multi_turn=False, judgment=None
+    question, answer, judge, ref_answer, openai_client, multi_turn=False, judgment=None
 ):
     kwargs = {}
     model = judge.model_name
@@ -150,7 +150,9 @@ def run_judge_single(
     conv.append_message(conv.roles[1], None)
 
     if judgment is None:
-        judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
+        judgment = chat_completion_openai(
+            openai_client, model, conv, temperature=0, max_tokens=2048
+        )
 
     if judge.prompt_template["output_format"] == "[[rating]]":
         match = re.search(one_score_pattern, judgment)
@@ -169,7 +171,7 @@ def run_judge_single(
     return rating, user_prompt, judgment
 
 
-def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
+def play_a_match_single(openai_client, match: MatchSingle, output_file: str) -> dict:
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -186,6 +188,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
             answer,
             judge,
             ref_answer,
+            openai_client,
             multi_turn=multi_turn,
             judgment=judgment,
         )
@@ -215,10 +218,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
     return result
 
 
-def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None) -> str:
-    if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
+def chat_completion_openai(openai_client, model, conv, temperature, max_tokens) -> str:
     output = API_ERROR_OUTPUT
     for _ in range(API_MAX_RETRY):
         try:
@@ -232,16 +232,16 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None)
                     messages[0]["content"] + "\n" + messages[1]["content"]
                 )
                 messages = messages[1:]
-            response = openai.ChatCompletion.create(
+            response = openai_client.chat.completions.create(
                 model=model,
                 messages=messages,
                 n=1,
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
-            output = response["choices"][0]["message"]["content"]
+            output = response.choices[0].message.content
             break
-        except openai.error.OpenAIError as e:
+        except openai.OpenAIError as e:
             print(type(e), e)
             time.sleep(API_RETRY_SLEEP)
 
diff --git a/src/instructlab/eval/mt_bench_judgment.py b/src/instructlab/eval/mt_bench_judgment.py
index 576d17f..24a6a5b 100644
--- a/src/instructlab/eval/mt_bench_judgment.py
+++ b/src/instructlab/eval/mt_bench_judgment.py
@@ -128,6 +128,7 @@ def make_judgment(
 def judge_model(
     model_name,
     judge_model_name,
+    openai_client,
     branch=None,
     bench_name="mt_bench",
     output_dir="eval_output",
@@ -218,11 +219,11 @@ def judge_model(
     # Play matches
     if max_workers == 1:
         for match in tqdm(matches):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
     else:
 
         def play_a_match_wrapper(match):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
 
         np.random.seed(0)
         np.random.shuffle(matches)
@@ -250,7 +251,9 @@ def generate_judgment(
 ):
     """Generate judgment with scores and qa_pairs for a model"""
     if model_api_base is not None:
-        openai.api_base = model_api_base
+        openai_client = openai.OpenAI(base_url=model_api_base)
+    else:
+        openai_client = openai.OpenAI()
 
     first_n_env = os.environ.get("INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS")
     if first_n_env is not None and first_n is None:
@@ -259,6 +262,7 @@
     question_file, judgment_file, answer_file = judge_model(
         model_name,
         judge_model_name,
+        openai_client,
         bench_name=bench_name,
         output_dir=output_dir,
         data_dir=data_dir,
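
The core change above is the migration from the module-level `openai` 0.x API (mutating the global `openai.api_base`) to the client-object API introduced in `openai` 1.x, with the constructed client threaded through `generate_answers`/`generate_judgment` down into `chat_completion_openai`. For reference, a minimal standalone sketch of the 1.x pattern this patch adopts; the base URL is an illustrative placeholder, not a value taken from the patch:

```python
# Sketch of the openai>=1.x client pattern adopted by this patch.
# Assumes OPENAI_API_KEY is set in the environment (or pass api_key=...).
import openai

# Point the client at an OpenAI-compatible server, e.g. a local vLLM
# instance (placeholder URL); omit base_url for the default OpenAI endpoint.
client = openai.OpenAI(base_url="http://localhost:8000/v1")

try:
    response = client.chat.completions.create(
        model="instructlab/granite-7b-lab",
        messages=[{"role": "user", "content": "Hello"}],
        n=1,
        temperature=0,
        max_tokens=256,
    )
    # 1.x returns typed objects, so the old dict access
    # response["choices"][0]["message"]["content"] becomes attribute access:
    output = response.choices[0].message.content
except openai.OpenAIError as e:  # errors moved from openai.error to the top level
    print(type(e), e)
```

Passing a client instance down the call chain, rather than configuring a global, keeps each run's endpoint configuration isolated, which is why `get_answer`, `run_judge_single`, `play_a_match_single`, and `judge_model` all grow an `openai_client` parameter.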