Updating to > 1.0 openai #17

Merged 1 commit on Jun 26, 2024

4 changes: 2 additions & 2 deletions README.md
@@ -26,7 +26,7 @@ source venv/bin/activate
pip install -r requirements.txt
pip install -r requirements-dev.txt
pip install -e .
pip install vllm==0.3.3
pip install vllm
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1
```

@@ -65,7 +65,7 @@ eval_output/
```

```shell
export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times
export INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=40 # Optional if you want to shorten run times
python3 tests/test_judge_answers.py
python3 tests/test_branch_judge_answers.py
```
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@
FastChat
GitPython>=3.1.42,<4.0.0
shortuuid
openai<1.0.0
Contributor: Are there any implications to this bump, or is it a clean reversioning?

Member Author: I might not be understanding the question, but there were several changes in the API that needed corresponding changes on our side; those changes are in the commit.

openai>=1.13.3,<2.0.0
psutil
torch
transformers
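On the reviewer's question above: the bump is not a clean re-pin. openai 1.x drops the module-level configuration and the `ChatCompletion` entry point used before, which is why the commit also rewrites the call sites below. A minimal sketch of the difference, with placeholder base URL and model name:

```python
# Sketch of the openai 0.x -> 1.x call-pattern change this bump requires.
# The base_url and model below are placeholders, not values from this PR.
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="NO_API_KEY")

# Pre-1.0 equivalent (for comparison only):
#   openai.api_base = "http://localhost:8000/v1"
#   response = openai.ChatCompletion.create(model=..., messages=...)
#   text = response["choices"][0]["message"]["content"]      # dict-style access
#   ... except openai.error.OpenAIError: ...
try:
    response = client.chat.completions.create(
        model="instructlab/granite-7b-lab",
        messages=[{"role": "user", "content": "Say hello in one word."}],
        temperature=0,
        max_tokens=16,
    )
    print(response.choices[0].message.content)  # attribute-style access in 1.x
except openai.OpenAIError as err:  # exceptions moved to the top-level module in 1.x
    print(type(err), err)
```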
6 changes: 0 additions & 6 deletions src/instructlab/eval/__init__.py
@@ -1,6 +0,0 @@
# Standard
import os

openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None:
os.environ["OPENAI_API_KEY"] = "NO_API_KEY"
3 changes: 1 addition & 2 deletions src/instructlab/eval/mmlu.py
@@ -50,8 +50,7 @@ def run(self) -> tuple:

individual_scores: dict = {}
agg_score: float = 0.0
model_args = f"pretrained= {self.model_path}, dtype= {self.model_dtype}"

model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
Member Author (@danmcp, Jun 26, 2024): @alimaredia @alinaryan @JamesKunstle FYI in case you hit this issue as well.

mmlu_output = simple_evaluate(
model="hf",
model_args=model_args,
8 changes: 4 additions & 4 deletions src/instructlab/eval/mt_bench.py
@@ -43,7 +43,7 @@ def gen_answers(self, server_url) -> None:
"""
mt_bench_answers.generate_answers(
self.model_name,
model_api_base=server_url,
server_url,
output_dir=self.output_dir,
max_workers=self.max_workers,
)
@@ -63,8 +63,8 @@ def judge_answers(self, server_url) -> tuple:
return mt_bench_judgment.generate_judgment(
self.model_name,
self.judge_model_name,
server_url,
max_workers=self.max_workers,
model_api_base=server_url,
output_dir=self.output_dir,
)

@@ -113,8 +113,8 @@ def gen_answers(self, server_url) -> None:
)
mt_bench_answers.generate_answers(
self.model_name,
server_url,
branch=self.branch,
model_api_base=server_url,
output_dir=self.output_dir,
data_dir=self.output_dir,
max_workers=self.max_workers,
@@ -134,9 +134,9 @@ def judge_answers(self, server_url) -> tuple:
_, qa_pairs, _ = mt_bench_judgment.generate_judgment(
self.model_name,
self.judge_model_name,
server_url,
branch=self.branch,
max_workers=self.max_workers,
model_api_base=server_url,
output_dir=self.output_dir,
data_dir=self.output_dir,
bench_name="mt_bench_branch",
11 changes: 7 additions & 4 deletions src/instructlab/eval/mt_bench_answers.py
@@ -41,6 +41,7 @@ def get_answer(
max_tokens: int,
answer_file: str,
force_temperature: float,
openai_client,
):
"""Answer a question with the model"""
assert force_temperature is None or question.get("required_temperature") is None
@@ -62,7 +63,9 @@
conv.append_message(conv.roles[0], question["turns"][j])
conv.append_message(conv.roles[1], None)

output = chat_completion_openai(model, conv, temperature, max_tokens)
output = chat_completion_openai(
openai_client, model, conv, temperature, max_tokens
)

conv.update_last_message(output)
turns.append(output)
@@ -85,6 +88,7 @@ def get_answer(

def generate_answers(
model_name,
model_api_base,
branch=None,
output_dir="eval_output",
data_dir=None,
@@ -94,12 +98,10 @@
num_choices=1,
max_tokens=1024,
max_workers=1,
model_api_base=None,
bench_name="mt_bench",
):
"""Generate model answers to be judged"""
if model_api_base is not None:
openai.api_base = model_api_base
openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

if data_dir is None:
data_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -125,6 +127,7 @@
max_tokens,
answer_file,
force_temperature,
openai_client,
)
futures.append(future)

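With model_api_base now a required positional parameter (and the openai client constructed inside the function), a caller passes the endpoint directly. An illustrative call; the server URL is a placeholder for a locally running OpenAI-compatible endpoint such as vLLM:

```python
# Illustrative only; assumes an OpenAI-compatible server is already running at the URL below.
from instructlab.eval import mt_bench_answers

mt_bench_answers.generate_answers(
    "instructlab/granite-7b-lab",  # model_name
    "http://localhost:8000/v1",    # model_api_base, previously the optional keyword model_api_base=
    output_dir="eval_output",
    max_workers=1,
)
```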
20 changes: 10 additions & 10 deletions src/instructlab/eval/mt_bench_common.py
@@ -117,7 +117,7 @@ def load_judge_prompts(prompt_file: str) -> dict:


def run_judge_single(
question, answer, judge, ref_answer, multi_turn=False, judgment=None
question, answer, judge, ref_answer, openai_client, multi_turn=False, judgment=None
):
kwargs = {}
model = judge.model_name
@@ -150,7 +150,9 @@ def run_judge_single(
conv.append_message(conv.roles[1], None)

if judgment is None:
judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
judgment = chat_completion_openai(
openai_client, model, conv, temperature=0, max_tokens=2048
)

if judge.prompt_template["output_format"] == "[[rating]]":
match = re.search(one_score_pattern, judgment)
@@ -169,7 +171,7 @@ def run_judge_single(
return rating, user_prompt, judgment


def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
def play_a_match_single(openai_client, match: MatchSingle, output_file: str) -> dict:
question, model, answer, judge, ref_answer, multi_turn = (
match.question,
match.model,
@@ -186,6 +188,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
answer,
judge,
ref_answer,
openai_client,
multi_turn=multi_turn,
judgment=judgment,
)
@@ -215,10 +218,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
return result


def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None) -> str:
if api_dict is not None:
openai.api_base = api_dict["api_base"]
openai.api_key = api_dict["api_key"]
def chat_completion_openai(openai_client, model, conv, temperature, max_tokens) -> str:
output = API_ERROR_OUTPUT
for _ in range(API_MAX_RETRY):
try:
@@ -232,16 +232,16 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None)
messages[0]["content"] + "\n" + messages[1]["content"]
)
messages = messages[1:]
response = openai.ChatCompletion.create(
response = openai_client.chat.completions.create(
model=model,
messages=messages,
n=1,
temperature=temperature,
max_tokens=max_tokens,
)
output = response["choices"][0]["message"]["content"]
output = response.choices[0].message.content
break
except openai.error.OpenAIError as e:
except openai.OpenAIError as e:
print(type(e), e)
time.sleep(API_RETRY_SLEEP)

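Since chat_completion_openai no longer reads module-level openai settings or an api_dict, callers construct a client and hand it in. A sketch of driving the refactored helper directly; the URL and judge model are placeholders, and the FastChat import path is assumed:

```python
# Sketch only; assumes FastChat is installed and an OpenAI-compatible endpoint is reachable.
import openai
from fastchat.model.model_adapter import get_conversation_template  # assumed import path

from instructlab.eval import mt_bench_common

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="NO_API_KEY")

conv = get_conversation_template("gpt-4")
conv.append_message(conv.roles[0], "Reply with a single word.")
conv.append_message(conv.roles[1], None)

output = mt_bench_common.chat_completion_openai(client, "gpt-4", conv, temperature=0, max_tokens=32)
print(output)
```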
26 changes: 15 additions & 11 deletions src/instructlab/eval/mt_bench_judgment.py
@@ -90,12 +90,15 @@ def make_judgment(
if bench_name == "mt_bench":
# Second turn
df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean()
turn2_score = df_2["score"].iloc[0]
turn_scores.append(turn2_score)
if len(df_2.index) > 0:
turn2_score = df_2["score"].iloc[0]
turn_scores.append(turn2_score)

# Average
df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
overall_score = df_3["score"].iloc[0]
# Average
df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean()
overall_score = df_3["score"].iloc[0]
else:
turn_scores.append("N/A")

question_df = pd.read_json(question_file, lines=True)
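
The added length check covers judgment sets that contain no second-turn rows; in that case the turn-2 score is recorded as "N/A" rather than hitting an IndexError on an empty frame. A minimal pandas sketch of the guarded path (toy data, not from this repo):

```python
import pandas as pd

# Toy judgment data containing only first-turn rows.
judgment_df = pd.DataFrame({"model": ["m", "m"], "turn": [1, 1], "score": [7.0, 8.0]})

turn_scores = []
df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean()
if len(df_2.index) > 0:
    turn_scores.append(df_2["score"].iloc[0])
else:
    turn_scores.append("N/A")

print(turn_scores)  # ['N/A']
```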

@@ -128,6 +131,7 @@ def make_judgment(
def judge_model(
model_name,
judge_model_name,
openai_client,
branch=None,
bench_name="mt_bench",
output_dir="eval_output",
@@ -218,11 +222,11 @@ def judge_model(
# Play matches
if max_workers == 1:
for match in tqdm(matches):
play_a_match_single(match, output_file=output_file)
play_a_match_single(openai_client, match, output_file=output_file)
else:

def play_a_match_wrapper(match):
play_a_match_single(match, output_file=output_file)
play_a_match_single(openai_client, match, output_file=output_file)

np.random.seed(0)
np.random.shuffle(matches)
@@ -239,26 +243,26 @@ def play_a_match_wrapper(match):
def generate_judgment(
model_name,
judge_model_name,
model_api_base,
bench_name="mt_bench",
output_dir="eval_output",
data_dir=None,
branch=None,
model_list=None,
max_workers=1,
first_n=None,
model_api_base=None,
):
"""Generate judgment with scores and qa_pairs for a model"""
if model_api_base is not None:
openai.api_base = model_api_base
openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

first_n_env = os.environ.get("INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS")
first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS")
if first_n_env is not None and first_n is None:
first_n = int(first_n_env)

question_file, judgment_file, answer_file = judge_model(
model_name,
judge_model_name,
openai_client,
bench_name=bench_name,
output_dir=output_dir,
data_dir=data_dir,
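Downstream, generate_judgment takes the endpoint positionally, builds the openai client itself, and reads the optional question cap from the renamed INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS variable. An illustrative call; the judge model name and URL are placeholders:

```python
# Illustrative only; assumes the judge model is served behind an OpenAI-compatible endpoint.
import os

from instructlab.eval import mt_bench_judgment

os.environ["INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS"] = "40"  # optional cap, renamed in this PR

_, qa_pairs, _ = mt_bench_judgment.generate_judgment(
    "instructlab/granite-7b-lab",  # model_name
    "gpt-4",                       # judge_model_name (placeholder)
    "http://localhost:8000/v1",    # model_api_base, now a required positional argument
    output_dir="eval_output",
    max_workers=1,
)
print(f"qa_pairs length: {len(qa_pairs)}")
```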
2 changes: 1 addition & 1 deletion tests/test_branch_judge_answers.py
@@ -14,7 +14,7 @@
print(f"QA Pair 0:")
pprint.pprint(qa_pairs[0])

print(f"base_qa_pairs length: {len(qa_pairs)}")
print(f"qa_pairs length: {len(qa_pairs)}")

for qa_pair in qa_pairs:
question_id = qa_pair.get("question_id")
2 changes: 1 addition & 1 deletion tests/test_judge_answers.py
@@ -15,7 +15,7 @@
print(f"QA Pair 0:")
pprint.pprint(qa_pairs[0])

print(f"base_qa_pairs length: {len(qa_pairs)}")
print(f"qa_pairs length: {len(qa_pairs)}")

for qa_pair in qa_pairs:
assert qa_pair.get("question_id") is not None