Working toward a functional state
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 22, 2024
1 parent ad020ef commit 9882a55
Showing 25 changed files with 878 additions and 1,479 deletions.
10 changes: 9 additions & 1 deletion .spellcheck-en-custom.txt
@@ -2,5 +2,13 @@
 # make spellcheck-sort
 # Please keep this file sorted:
 # SPDX-License-Identifier: Apache-2.0
-eval
 Tatsu
+TODO
+eval
+gpt
+instructlab
+jsonl
+justfile
+openai
+venv
+vllm
91 changes: 90 additions & 1 deletion README.md
@@ -5,4 +5,93 @@
![Release](https://img.shields.io/github/v/release/instructlab/eval)
![License](https://img.shields.io/github/license/instructlab/eval)

Python Library for Evaluation

## MT-Bench / PR-Bench Testing Steps

```shell
git clone https://github.com/instructlab/taxonomy.git && pushd taxonomy && git branch rc && popd
git clone --bare https://github.com/instructlab/eval.git && git clone eval.git/ && cd eval && git remote add syncrepo ../eval.git
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
pip install -r requirements-dev.txt
pip install -e .
pip install vllm==0.3.3
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 4
```
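
Once the vLLM server is up, it can be worth a quick smoke test before running the scripts below. A minimal check, assuming vLLM's default OpenAI-compatible endpoint at `http://localhost:8000/v1` (the URL and the dummy key are assumptions, not part of this commit):

```python
# Smoke test for the local vLLM server, using the pre-1.0 openai client
# pinned in requirements.txt (openai<1.0.0).
import openai

openai.api_base = "http://localhost:8000/v1"  # assumed default vLLM endpoint
openai.api_key = "NO_API_KEY"  # vLLM does not validate the key by default

# Should include instructlab/granite-7b-lab if the server came up correctly.
print([model["id"] for model in openai.Model.list()["data"]])
```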

In another shell window:

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_gen_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_gen_answers.py
```
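
These scripts are presumably thin wrappers around the library entry point visible in the diff further down (`generate_answers`). A hypothetical sketch of the equivalent direct calls, assuming an `instructlab.eval.gen_api_answer` import path (the real module path and test contents may differ):

```python
# Hypothetical equivalent of the two test scripts above. The import path is
# an assumption; the signature matches generate_answers in this commit.
from instructlab.eval.gen_api_answer import generate_answers

api_base = "http://localhost:8000/v1"  # the vLLM server started earlier

# MT-Bench answers -> eval_output/mt_bench/model_answer/...
generate_answers("instructlab/granite-7b-lab", model_api_base=api_base)

# PR-Bench answers, once per taxonomy branch being compared
# -> eval_output/pr_bench/<branch>/model_answer/...
for branch in ("main", "rc"):
    generate_answers(
        "instructlab/granite-7b-lab",
        branch=branch,
        bench_name="pr_bench",
        model_api_base=api_base,
    )
```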

Example output tree:

```shell
eval_output/
├── mt_bench
│   └── model_answer
│       └── instructlab
│           └── granite-7b-lab.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── instructlab
    │           └── granite-7b-lab.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── question.jsonl
        └── reference_answer
            └── instructlab
                └── granite-7b-lab.jsonl
```

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_judge_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_judge_answers.py
```

Example output tree:

```shell
eval_output/
├── mt_bench
│   ├── model_answer
│   │   └── instructlab
│   │       └── granite-7b-lab.jsonl
│   └── model_judgment
│       └── instructlab
│           └── granite-7b-lab_single.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── model_judgment
    │   │   └── instructlab
    │   │       └── granite-7b-lab_single.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── instructlab
    │           └── granite-7b-lab.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── model_judgment
        │   └── instructlab
        │       └── granite-7b-lab_single.jsonl
        ├── question.jsonl
        └── reference_answer
            └── instructlab
                └── granite-7b-lab.jsonl
```
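
The `*_single.jsonl` judgment files appear to follow FastChat's single-answer grading format, where each JSON line carries a numeric `score` (that field name comes from FastChat, not from anything in this commit). A small sketch for aggregating one of them:

```python
# Average the judge scores in one judgment file. Assumes FastChat's "single"
# grading format: one JSON object per line with a "score" field, -1 on failure.
import json

path = "eval_output/mt_bench/model_judgment/instructlab/granite-7b-lab_single.jsonl"
scores = []
with open(path, encoding="utf-8") as fin:
    for line in fin:
        score = json.loads(line).get("score", -1)
        if score >= 0:
            scores.append(score)

print(f"{len(scores)} judgments, average score {sum(scores) / len(scores):.2f}")
```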
80 changes: 0 additions & 80 deletions data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl

This file was deleted.

160 changes: 0 additions & 160 deletions data/mt_bench/model_judgment/gpt-4_single.jsonl

This file was deleted.

80 changes: 0 additions & 80 deletions data/mt_bench/reference_answer/gpt-4-turbo.jsonl

This file was deleted.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+FastChat
+GitPython>=3.1.42,<4.0.0
 shortuuid
 openai<1.0.0
-anthropic
 psutil
 torch
 transformers
 accelerate
 pandas
 pandas-stubs
gen_api_answer.py (full path not captured in this view)
@@ -1,49 +1,49 @@
 """Generate answers with GPT-4
 Usage:
 python3 gen_api_answer.py --model gpt-3.5-turbo
 """
-import argparse
+# Standard
+import concurrent.futures
 import json
 import os
 import time
-import concurrent.futures
 
+# Third Party
+# TODO need to look into this dependency
+from fastchat.model.model_adapter import get_conversation_template  # type: ignore
 import openai
 import shortuuid
 import tqdm
 
+# Local
 from .common import (
+    bench_dir,
+    chat_completion_openai,
     load_questions,
     temperature_config,
-    chat_completion_openai,
-    chat_completion_anthropic,
-    chat_completion_palm,
-    ANTHROPIC_MODEL_LIST,
 )
-#TODO need to look into this dependency
-from fastchat.model.model_adapter import get_conversation_template
 
 
 def reorg_answer_file(answer_file):
     """Sort by question id and de-duplication"""
     answers = {}
-    with open(answer_file, "r") as fin:
+    with open(answer_file, "r", encoding="utf-8") as fin:
         for l in fin:
             qid = json.loads(l)["question_id"]
             answers[qid] = l
 
     qids = sorted(list(answers.keys()))
-    with open(answer_file, "w") as fout:
+    with open(answer_file, "w", encoding="utf-8") as fout:
         for qid in qids:
             fout.write(answers[qid])
 
 
 def get_answer(
-    question: dict, model: str, num_choices: int, max_tokens: int, answer_file: str, force_temperature: float
+    question: dict,
+    model: str,
+    num_choices: int,
+    max_tokens: int,
+    answer_file: str,
+    force_temperature: float,
 ):
-    assert (
-        force_temperature is not None and "required_temperature" in question.keys()
-    ) == False
+    """Answer a question with the model"""
+    assert force_temperature is None or question.get("required_temperature") is None
     if force_temperature is not None:
         temperature = force_temperature
     elif "required_temperature" in question.keys():
@@ -54,7 +54,6 @@ def get_answer(
         temperature = 0.7
 
     choices = []
-    chat_state = None  # for palm-2 model
     for i in range(num_choices):
         conv = get_conversation_template(model)
 
@@ -63,14 +62,7 @@ def get_answer(
             conv.append_message(conv.roles[0], question["turns"][j])
             conv.append_message(conv.roles[1], None)
 
-            if model in ANTHROPIC_MODEL_LIST:
-                output = chat_completion_anthropic(model, conv, temperature, max_tokens)
-            elif model == "palm-2-chat-bison-001":
-                chat_state, output = chat_completion_palm(
-                    chat_state, model, conv, temperature, max_tokens
-                )
-            else:
-                output = chat_completion_openai(model, conv, temperature, max_tokens)
+            output = chat_completion_openai(model, conv, temperature, max_tokens)
 
             conv.update_last_message(output)
             turns.append(output)
@@ -87,31 +79,43 @@ def get_answer(
     }
 
     os.makedirs(os.path.dirname(answer_file), exist_ok=True)
-    with open(answer_file, "a") as fout:
+    with open(answer_file, "a", encoding="utf-8") as fout:
         fout.write(json.dumps(ans) + "\n")
 
-def run(
-    question_begin=None,
-    question_end=None,
-    force_temperature=None,
-    answer_file=None,
-    model_name="gpt-3.5-turbo",
-    num_choices=1,
-    max_tokens=1024,
-    parallel=1,
-    openai_api_base=None):
-
-    if openai_api_base is not None:
-        openai.api_base = openai_api_base
-
-    question_file = f"data/mt_bench/question.jsonl"
-
+
+def generate_answers(
+    model_name,
+    branch=None,
+    output_dir="eval_output",
+    data_dir=None,
+    question_begin=None,
+    question_end=None,
+    force_temperature=None,
+    num_choices=1,
+    max_tokens=1024,
+    max_workers=1,
+    model_api_base=None,
+    bench_name="mt_bench",
+):
+    """Generate model answers to be judged"""
+    if model_api_base is not None:
+        openai.api_base = model_api_base
+
+    if data_dir is None:
+        data_dir = os.path.join(os.path.dirname(__file__), "data")
+
+    data_base_dir = bench_dir(data_dir, bench_name, branch)
+    output_base_dir = bench_dir(output_dir, bench_name, branch)
+
+    question_file = f"{data_base_dir}/question.jsonl"
     questions = load_questions(question_file, question_begin, question_end)
 
-    if not answer_file:
-        answer_file = f"data/mt_bench/model_answer/{model_name}.jsonl"
-    print(f"Output to {answer_file}")
+    answer_file = f"{output_base_dir}/model_answer/{model_name}.jsonl"
+    if os.path.isfile(answer_file):
+        os.remove(answer_file)
+    # print(f"Output to {answer_file}")
 
-    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []
         for question in questions:
             future = executor.submit(
(diff truncated; the remaining changed files are not shown)
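
For orientation, the `bench_dir` helper imported from `.common` evidently maps a base directory, bench name, and optional branch onto the layout shown in the README trees above. A plausible reconstruction, not the committed implementation:

```python
import os


def bench_dir(base_dir: str, bench_name: str, branch: str | None) -> str:
    """Sketch: eval_output/mt_bench, or eval_output/pr_bench/<branch>."""
    path = os.path.join(base_dir, bench_name)
    if branch is not None:
        path = os.path.join(path, branch)
    return path
```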
