Updating to openai > 1.0
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 26, 2024
1 parent 100c512 commit d45fc4d
Showing 10 changed files with 41 additions and 41 deletions.
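The thrust of the change set: the pre-1.0 module-level openai API (openai.api_base, openai.ChatCompletion.create, dict-style responses, openai.error exceptions) is replaced with the 1.x client-object API, and an explicit client is threaded through answer generation and judging. A minimal sketch of the before/after pattern, assuming a local vLLM server as in the README (the base_url and model below are placeholders; "NO_API_KEY" mirrors the placeholder key used in the diff):

```python
# Sketch only: base_url and model are illustrative, not values this commit pins.
import openai

# pre-1.0 (removed):  openai.api_base = url
#                     resp = openai.ChatCompletion.create(...)
#                     text = resp["choices"][0]["message"]["content"]
# 1.x (adopted here):
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="NO_API_KEY")
resp = client.chat.completions.create(
    model="instructlab/granite-7b-lab",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(resp.choices[0].message.content)
```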
4 changes: 2 additions & 2 deletions README.md
@@ -26,7 +26,7 @@ source venv/bin/activate
 pip install -r requirements.txt
 pip install -r requirements-dev.txt
 pip install -e .
-pip install vllm==0.3.3
+pip install vllm
 python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1
 ```

@@ -65,7 +65,7 @@ eval_output/
 ```

 ```shell
-export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times
+export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=40 # Optional if you want to shorten run times
 python3 tests/test_judge_answers.py
 python3 tests/test_branch_judge_answers.py
 ```
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@
 FastChat
 GitPython>=3.1.42,<4.0.0
 shortuuid
-openai<1.0.0
+openai>=1.13.3,<2.0.0
 psutil
 torch
 transformers
6 changes: 0 additions & 6 deletions src/instructlab/eval/__init__.py
@@ -1,6 +0,0 @@
# Standard
import os

openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None:
os.environ["OPENAI_API_KEY"] = "NO_API_KEY"
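With the 1.x client, the API key is passed per client instance rather than read from the environment at import time, so this module-level fallback is no longer needed; the call sites below construct openai.OpenAI(..., api_key="NO_API_KEY") explicitly instead.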
3 changes: 1 addition & 2 deletions src/instructlab/eval/mmlu.py
@@ -50,8 +50,7 @@ def run(self) -> tuple:

         individual_scores: dict = {}
         agg_score: float = 0.0
-        model_args = f"pretrained= {self.model_path}, dtype= {self.model_dtype}"
-
+        model_args = f"pretrained={self.model_path}, dtype={self.model_dtype}"
         mmlu_output = simple_evaluate(
             model="hf",
             model_args=model_args,
8 changes: 4 additions & 4 deletions src/instructlab/eval/mt_bench.py
@@ -43,7 +43,7 @@ def gen_answers(self, server_url) -> None:
         """
         mt_bench_answers.generate_answers(
             self.model_name,
-            model_api_base=server_url,
+            server_url,
             output_dir=self.output_dir,
             max_workers=self.max_workers,
         )
@@ -63,8 +63,8 @@ def judge_answers(self, server_url) -> tuple:
         return mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
+            server_url,
             max_workers=self.max_workers,
-            model_api_base=server_url,
             output_dir=self.output_dir,
         )

@@ -113,8 +113,8 @@ def gen_answers(self, server_url) -> None:
         )
         mt_bench_answers.generate_answers(
             self.model_name,
+            server_url,
             branch=self.branch,
-            model_api_base=server_url,
             output_dir=self.output_dir,
             data_dir=self.output_dir,
             max_workers=self.max_workers,
@@ -134,9 +134,9 @@ def judge_answers(self, server_url) -> tuple:
         _, qa_pairs, _ = mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
+            server_url,
             branch=self.branch,
             max_workers=self.max_workers,
-            model_api_base=server_url,
             output_dir=self.output_dir,
             data_dir=self.output_dir,
             bench_name="mt_bench_branch",
11 changes: 7 additions & 4 deletions src/instructlab/eval/mt_bench_answers.py
@@ -41,6 +41,7 @@ def get_answer(
     max_tokens: int,
     answer_file: str,
     force_temperature: float,
+    openai_client,
 ):
     """Answer a question with the model"""
     assert force_temperature is None or question.get("required_temperature") is None
@@ -62,7 +63,9 @@ def get_answer(
         conv.append_message(conv.roles[0], question["turns"][j])
         conv.append_message(conv.roles[1], None)

-        output = chat_completion_openai(model, conv, temperature, max_tokens)
+        output = chat_completion_openai(
+            openai_client, model, conv, temperature, max_tokens
+        )

         conv.update_last_message(output)
         turns.append(output)
@@ -85,6 +88,7 @@ def get_answer(

 def generate_answers(
     model_name,
+    model_api_base,
     branch=None,
     output_dir="eval_output",
     data_dir=None,
@@ -94,12 +98,10 @@ def generate_answers(
     num_choices=1,
     max_tokens=1024,
     max_workers=1,
-    model_api_base=None,
     bench_name="mt_bench",
 ):
     """Generate model answers to be judged"""
-    if model_api_base is not None:
-        openai.api_base = model_api_base
+    openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

     if data_dir is None:
         data_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -125,6 +127,7 @@ def generate_answers(
                 max_tokens,
                 answer_file,
                 force_temperature,
+                openai_client,
             )
             futures.append(future)
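The refactor above builds one client per generate_answers run and hands it to every worker, instead of mutating module-level openai state. A rough sketch of that shape, with hypothetical questions and placeholder URL and model (the 1.x client wraps httpx and can generally be shared across threads):

```python
# Sketch of the client-per-run pattern; get_answer is a simplified stand-in
# for the module's real function, and all values are illustrative.
from concurrent.futures import ThreadPoolExecutor

import openai


def get_answer(question, openai_client):
    resp = openai_client.chat.completions.create(
        model="instructlab/granite-7b-lab",
        messages=[{"role": "user", "content": question}],
    )
    return resp.choices[0].message.content


openai_client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="NO_API_KEY")
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [
        executor.submit(get_answer, q, openai_client)
        for q in ["What is MT-Bench?", "What is MMLU?"]
    ]
    answers = [f.result() for f in futures]
```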
20 changes: 10 additions & 10 deletions src/instructlab/eval/mt_bench_common.py
@@ -117,7 +117,7 @@ def load_judge_prompts(prompt_file: str) -> dict:


 def run_judge_single(
-    question, answer, judge, ref_answer, multi_turn=False, judgment=None
+    question, answer, judge, ref_answer, openai_client, multi_turn=False, judgment=None
 ):
     kwargs = {}
     model = judge.model_name
@@ -150,7 +150,9 @@ def run_judge_single(
         conv.append_message(conv.roles[1], None)

     if judgment is None:
-        judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
+        judgment = chat_completion_openai(
+            openai_client, model, conv, temperature=0, max_tokens=2048
+        )

     if judge.prompt_template["output_format"] == "[[rating]]":
         match = re.search(one_score_pattern, judgment)
@@ -169,7 +171,7 @@ def run_judge_single(
     return rating, user_prompt, judgment


-def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
+def play_a_match_single(openai_client, match: MatchSingle, output_file: str) -> dict:
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -186,6 +188,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
         answer,
         judge,
         ref_answer,
+        openai_client,
         multi_turn=multi_turn,
         judgment=judgment,
     )
@@ -215,10 +218,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
     return result


-def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None) -> str:
-    if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
+def chat_completion_openai(openai_client, model, conv, temperature, max_tokens) -> str:
     output = API_ERROR_OUTPUT
     for _ in range(API_MAX_RETRY):
         try:
@@ -232,16 +232,16 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None)
                     messages[0]["content"] + "\n" + messages[1]["content"]
                 )
                 messages = messages[1:]
-            response = openai.ChatCompletion.create(
+            response = openai_client.chat.completions.create(
                 model=model,
                 messages=messages,
                 n=1,
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
-            output = response["choices"][0]["message"]["content"]
+            output = response.choices[0].message.content
             break
-        except openai.error.OpenAIError as e:
+        except openai.OpenAIError as e:
             print(type(e), e)
             time.sleep(API_RETRY_SLEEP)
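Two 1.x details show up in that hunk: responses are typed objects (attribute access, not dict subscripts), and the exception hierarchy moved from openai.error.* to the package top level. A self-contained sketch of the retry loop, with illustrative constants (the module defines its own retry values, which may differ):

```python
# Illustrative retry values and a simplified stand-in for the module's helper.
import time

import openai

API_MAX_RETRY = 3
API_RETRY_SLEEP = 2
API_ERROR_OUTPUT = "$ERROR$"


def chat_once(openai_client, model, messages):
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = openai_client.chat.completions.create(
                model=model, messages=messages, n=1, temperature=0, max_tokens=2048
            )
            output = response.choices[0].message.content  # attribute access in 1.x
            break
        except openai.OpenAIError as e:  # top-level in 1.x (was openai.error.*)
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output
```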
24 changes: 14 additions & 10 deletions src/instructlab/eval/mt_bench_judgment.py
@@ -90,12 +90,15 @@ def make_judgment(
     if bench_name == "mt_bench":
         # Second turn
         df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean()
-        turn2_score = df_2["score"].iloc[0]
-        turn_scores.append(turn2_score)
+        if len(df_2.index) > 0:
+            turn2_score = df_2["score"].iloc[0]
+            turn_scores.append(turn2_score)

-        # Average
-        df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
-        overall_score = df_3["score"].iloc[0]
+            # Average
+            df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
+            overall_score = df_3["score"].iloc[0]
+        else:
+            turn_scores.append("N/A")

     question_df = pd.read_json(question_file, lines=True)

@@ -128,6 +131,7 @@ def make_judgment(
 def judge_model(
     model_name,
     judge_model_name,
+    openai_client,
     branch=None,
     bench_name="mt_bench",
     output_dir="eval_output",
@@ -218,11 +222,11 @@ def judge_model(
     # Play matches
     if max_workers == 1:
         for match in tqdm(matches):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
     else:

         def play_a_match_wrapper(match):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)

         np.random.seed(0)
         np.random.shuffle(matches)
@@ -239,18 +243,17 @@ def play_a_match_wrapper(match):
 def generate_judgment(
     model_name,
     judge_model_name,
+    model_api_base,
     bench_name="mt_bench",
     output_dir="eval_output",
     data_dir=None,
     branch=None,
     model_list=None,
     max_workers=1,
     first_n=None,
-    model_api_base=None,
 ):
     """Generate judgment with scores and qa_pairs for a model"""
-    if model_api_base is not None:
-        openai.api_base = model_api_base
+    openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

     first_n_env = os.environ.get("INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS")
     if first_n_env is not None and first_n is None:
@@ -259,6 +262,7 @@ def generate_judgment(
     question_file, judgment_file, answer_file = judge_model(
         model_name,
         judge_model_name,
+        openai_client,
         bench_name=bench_name,
         output_dir=output_dir,
         data_dir=data_dir,
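The new len(df_2.index) guard matters because .iloc[0] on an empty selection raises IndexError, for example when no second-turn judgments exist yet. A toy reproduction with synthetic data:

```python
# Synthetic single-turn data: the turn == 2 selection comes back empty.
import pandas as pd

judgment_df = pd.DataFrame({"model": ["m"], "turn": [1], "score": [7.0]})
df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean()
if len(df_2.index) > 0:
    print(df_2["score"].iloc[0])
else:
    print("N/A")  # without the guard, df_2["score"].iloc[0] raises IndexError
```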
2 changes: 1 addition & 1 deletion tests/test_branch_judge_answers.py
@@ -14,7 +14,7 @@
 print(f"QA Pair 0:")
 pprint.pprint(qa_pairs[0])

-print(f"base_qa_pairs length: {len(qa_pairs)}")
+print(f"qa_pairs length: {len(qa_pairs)}")

 for qa_pair in qa_pairs:
     question_id = qa_pair.get("question_id")
2 changes: 1 addition & 1 deletion tests/test_judge_answers.py
@@ -15,7 +15,7 @@
 print(f"QA Pair 0:")
 pprint.pprint(qa_pairs[0])

-print(f"base_qa_pairs length: {len(qa_pairs)}")
+print(f"qa_pairs length: {len(qa_pairs)}")

 for qa_pair in qa_pairs:
     assert qa_pair.get("question_id") is not None
