Working toward a functional state
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 18, 2024
1 parent ad020ef commit fbcd6d3
Showing 11 changed files with 39 additions and 310 deletions.
80 changes: 0 additions & 80 deletions data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl

This file was deleted.

160 changes: 0 additions & 160 deletions data/mt_bench/model_judgment/gpt-4_single.jsonl

This file was deleted.

44 changes: 8 additions & 36 deletions src/instructlab/eval/common.py
@@ -232,11 +232,10 @@ def play_a_match_single(match: MatchSingle, output_file: str, do_batch: bool=Fal
match.ref_answer,
match.multi_turn,
)
create_batch = False
if do_batch:
batch_output_file = output_file.replace(".jsonl", "-batch-output.jsonl")
if os.path.isfile(batch_output_file):
create_batch = False
else:
if not os.path.isfile(batch_output_file):
create_batch = True

if judge.prompt_template["type"] == "single":
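The simplified flag logic above reduces to a single predicate: create the batch file only when batching is requested and the batch output does not already exist. A minimal sketch of that predicate (the helper name is illustrative, not part of this commit):

```python
import os

def needs_batch_file(output_file: str, do_batch: bool) -> bool:
    """Return True when batching is on and no batch output file exists yet."""
    if not do_batch:
        return False
    batch_output_file = output_file.replace(".jsonl", "-batch-output.jsonl")
    return not os.path.isfile(batch_output_file)
```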
@@ -432,10 +431,10 @@ def play_a_match_pair(match: MatchPair, output_file: str):
)
elif judge.prompt_template["type"] == "single":
m1_score, m1_user_prompt, m1_judgment = run_judge_single(
question, answer_1, judge
question, answer_1, judge, ref_answer
)
m2_score, m2_user_prompt, m2_judgment = run_judge_single(
question, answer_2, judge
question, answer_2, judge, ref_answer
)

if abs(m1_score - m2_score) <= TIE_DELTA:
@@ -471,7 +470,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):

if output_file:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "a") as fout:
with open(output_file, "a", encoding="utf-8") as fout:
fout.write(json.dumps(result) + "\n")

return result
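The result-append path above (ensure the parent directory, then append one UTF-8 JSON line) is a self-contained pattern; a small sketch of it as a standalone helper, with illustrative names only:

```python
import json
import os

def append_jsonl(path: str, record: dict) -> None:
    """Append a single record to a JSONL file, creating parent directories as needed."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as fout:
        fout.write(json.dumps(record) + "\n")
```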
@@ -532,12 +531,12 @@ def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict=
)
output = response["choices"][0]["message"]["content"]
break
except openai.error.OpenAIError as e:
print(type(e), e)
time.sleep(API_RETRY_SLEEP)
except openai.error.InvalidRequestError as e:
print(type(e), e)
break
except openai.error.OpenAIError as e:
print(type(e), e)
time.sleep(API_RETRY_SLEEP)
except KeyError:
print(response)
break
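The reorder above matters because, in the openai 0.x error hierarchy, `InvalidRequestError` is a subclass of `OpenAIError`; with the broad handler listed first, invalid requests fell into the retry branch instead of breaking out. A minimal sketch of the pattern with stand-in exception classes (not the real openai types):

```python
import time

class APIError(Exception):
    """Stand-in for a broad error class such as openai.error.OpenAIError."""

class InvalidRequest(APIError):
    """Stand-in for a narrower subclass such as openai.error.InvalidRequestError."""

def call_with_retry(call, retries=3, sleep_s=0.1):
    for _ in range(retries):
        try:
            return call()
        except InvalidRequest as e:
            # Must be listed before APIError, or this handler is never reached.
            print(type(e), e)
            break  # a malformed request will not succeed on retry
        except APIError as e:
            print(type(e), e)
            time.sleep(sleep_s)  # transient failure: back off and retry
    return None
```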
@@ -571,32 +570,6 @@ def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=Non
return output.strip()


def chat_completion_palm(chat_state, model, conv, temperature, max_tokens):
from fastchat.serve.api_provider import init_palm_chat

assert model == "palm-2-chat-bison-001"

if chat_state is None:
chat_state = init_palm_chat("chat-bison@001")

parameters = {
"temperature": temperature,
"top_p": 0.8,
"top_k": 40,
"max_output_tokens": max_tokens,
}
output = API_ERROR_OUTPUT
for _ in range(API_MAX_RETRY):
try:
response = chat_state.send_message(conv.messages[-2][1], **parameters)
output = response.text
break
except Exception as e:
print(type(e), e)
time.sleep(API_RETRY_SLEEP)
return chat_state, output


def normalize_game_key_single(gamekey, result):
"""Make the model names sorted in a game key."""
qid, model_1, model_2 = gamekey
@@ -778,7 +751,6 @@ def check_data(questions, model_answers, ref_answers, models, judges):
for q in questions:
if q["category"] not in NEED_REF_CATS:
continue
print(f"ALI: ref_answers is: {ref_answers}")
assert (
q["question_id"] in ref_answers[jg.model_name]
), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
2 changes: 1 addition & 1 deletion src/instructlab/eval/evaluator.py
@@ -5,7 +5,7 @@ class Evaluator:
"""
Parent class for Evaluators
Atttributes:
Attributes:
model_path Path to the model to be evaluated
"""

5 changes: 0 additions & 5 deletions src/instructlab/eval/gen_api_answer.py
@@ -18,7 +18,6 @@
temperature_config,
chat_completion_openai,
chat_completion_anthropic,
chat_completion_palm,
ANTHROPIC_MODEL_LIST,
)
#TODO need to look into this dependency
@@ -65,10 +64,6 @@ def get_answer(

if model in ANTHROPIC_MODEL_LIST:
output = chat_completion_anthropic(model, conv, temperature, max_tokens)
elif model == "palm-2-chat-bison-001":
chat_state, output = chat_completion_palm(
chat_state, model, conv, temperature, max_tokens
)
else:
output = chat_completion_openai(model, conv, temperature, max_tokens)

9 changes: 7 additions & 2 deletions src/instructlab/eval/gen_judgment.py
@@ -5,10 +5,11 @@
from concurrent.futures import ThreadPoolExecutor
import json

import openai
import numpy as np
from tqdm import tqdm

from instructlab.eval.common import (
from .common import (
load_questions,
load_model_answers,
load_judge_prompts,
@@ -175,7 +176,11 @@ def run(
parallel=1,
first_n=None,
yes=True,
batch=True):
batch=False,
openai_api_base=None):

if openai_api_base is not None:
openai.api_base = openai_api_base

question_file = f"data/{bench_name}/question.jsonl"
answer_dir = f"data/{bench_name}/model_answer/instructlab"
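With `openai_api_base` now threaded through `run()`, callers can point judgment at a local judge endpoint directly instead of relying on an environment variable. A sketch of the resulting call, mirroring how `mtbench.py` invokes it later in this commit (the endpoint URL is a placeholder):

```python
import os

from instructlab.eval import gen_judgment

os.environ["OPENAI_API_KEY"] = "NO_API_KEY"  # the judge client still expects a key to be set

# Placeholder endpoint; any OpenAI-compatible judge server works here.
output_file = gen_judgment.run(parallel=40, openai_api_base="http://localhost:8000/v1")
print(output_file)
```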
13 changes: 6 additions & 7 deletions src/instructlab/eval/mtbench.py
@@ -4,7 +4,7 @@
# Local
from .evaluator import Evaluator
import instructlab.eval.gen_api_answer as gen_api_answer
import instructlab.eval.gen_judgement as gen_judgement
import instructlab.eval.gen_judgment as gen_judgment


class MT_Bench_Evaluator(Evaluator):
@@ -18,24 +18,23 @@ class MT_Bench_Evaluator(Evaluator):
def __init__(self, server_url: str) -> None:
self.server_url = server_url

def gen_answers(self, answer_file, server_url) -> str:
def gen_answers(self, server_url) -> str:
""" Asks questions to model, returns path to answers"""
os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
gen_api_answer.run(answer_file=answer_file, model_name="instructlab/granite-7b-lab", openai_api_base=server_url)
return answer_file
gen_api_answer.run(model_name="instructlab/granite-7b-lab", openai_api_base=server_url)

#def judge_answers(self, judge_endpoint) -> tuple:
def judge_answers(self, judge_endpoint) -> str:
"""
Runs MT-Bench judgement
Runs MT-Bench judgment
Returns:
overall_score MT-Bench score for the overall model evaluation
qa_pairs Question and answer pairs from the evaluation
"""
os.environ['OPENAI_API_BASE'] = judge_endpoint
#os.environ['OPENAI_API_BASE'] = judge_endpoint
os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
output_file = gen_judgement.run(parallel=40)
output_file = gen_judgment.run(parallel=40, openai_api_base=judge_endpoint)
return output_file


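Taken together, the reworked `MT_Bench_Evaluator` surface drives the whole flow in a few lines; a sketch combining the two test scripts added in this commit (URLs are placeholders for a running model server and judge endpoint):

```python
from instructlab.eval.mtbench import MT_Bench_Evaluator

evaluator = MT_Bench_Evaluator(server_url="http://localhost:8000")

# Step 1: generate model answers against the serving endpoint.
evaluator.gen_answers("http://localhost:8000/v1")

# Step 2: judge the generated answers and print the judgment file path.
judgment_file = evaluator.judge_answers("http://localhost:8000/v1")
print(judgment_file)
```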
4 changes: 2 additions & 2 deletions src/instructlab/eval/show_result.py
@@ -112,8 +112,8 @@ def display_result_pairwise(args):
choices=["pairwise-baseline", "pairwise-all", "single"],
help=(
"Evaluation mode. "
"`pairwise-baseline` runs pairwise comparision against a baseline. "
"`pairwise-all` runs pairwise comparision between all pairs. "
"`pairwise-baseline` runs pairwise comparison against a baseline. "
"`pairwise-all` runs pairwise comparison between all pairs. "
"`single` runs single answer grading."
),
)
26 changes: 11 additions & 15 deletions steps_mt_bench.md
@@ -1,25 +1,21 @@
# MT-Bench Broken Down in Eval Scripts (not PR Bench)

### From justfile: `run_bench`
## TODO Figure out the right version. Latest fails with openai.types not found

pip install vllm==0.3.3

You should run with `--tensor-parallel-size <NUM GPUS>` and possibly increase `--max-model-len` to increase the context length

If dry run:
```shell
OPENAI_API_KEY="NO_API_KEY" python gen_api_answer.py \
--bench-name mt_bench \
--openai-api-base http://localhost:8000/v1 \
--model granite-7b-lab \
--num-choices 1 \
--question-begin 2 \
--question-end 4
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab
```

## From justfile: `run_bench`

If not dry run

```shell
OPENAI_API_KEY="NO_API_KEY" python gen_api_answer.py \
--bench-name mt_bench \
--openai-api-base http://localhost:8000/v1 \
--model granite-7b-lab \
--num-choices 1
OPENAI_API_KEY="NO_API_KEY" python3 test_gen_answers.py
```

results are in data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl
@@ -35,7 +31,7 @@ python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab
```

```shell
OPENAI_API_BASE=http://0.0.0.0:8000/v1 OPENAI_API_KEY="NO_API_KEY" python src/instructlab/eval/gen_judgment.py --bench-name mt_bench --parallel 40 --yes
OPENAI_API_KEY="NO_API_KEY" python3 test_judge_answers.py
```

results are in data/mt_bench/model_judgment/gpt-4_single.jsonl
4 changes: 4 additions & 0 deletions test_gen_answers.py
@@ -0,0 +1,4 @@
from instructlab.eval.mtbench import MT_Bench_Evaluator

mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")
mt_bench.gen_answers("http://localhost:8000/v1")
2 changes: 0 additions & 2 deletions test_mt_bench.py → test_judge_answers.py
@@ -1,7 +1,5 @@
from instructlab.eval.mtbench import MT_Bench_Evaluator

mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")
#path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
#print(path)
output_file = mt_bench.judge_answers("http://localhost:8000/v1")
print(output_file)
