Working toward a functional state
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 21, 2024
1 parent ad020ef commit 3ab7e72
Showing 25 changed files with 769 additions and 1,385 deletions.
10 changes: 9 additions & 1 deletion .spellcheck-en-custom.txt
@@ -2,5 +2,13 @@
# make spellcheck-sort
# Please keep this file sorted:
# SPDX-License-Identifier: Apache-2.0
eval
Tatsu
TODO
eval
gpt
instructlab
jsonl
justfile
openai
venv
vllm
88 changes: 87 additions & 1 deletion README.md
@@ -5,4 +5,90 @@
![Release](https://img.shields.io/github/v/release/instructlab/eval)
![License](https://img.shields.io/github/license/instructlab/eval)

Python library for Evaluation
Python Library for Evaluation

## MT-Bench / PR-Bench Testing Steps

TODO: Figure out the right vllm version. The latest release fails with `openai.types` not found in the same venv.

```shell
pip install vllm==0.3.3
```

Run with `--tensor-parallel-size <NUM GPUS>`, and consider increasing `--max-model-len` if you need a longer context window.

```shell
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 4
```

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_gen_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_gen_answers.py
```

Example output

```
eval_output/
├── mt_bench
│   └── model_answer
│       └── instructlab
│           └── granite-7b-lab.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── gpt-4.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── question.jsonl
        └── reference_answer
            └── gpt-4.jsonl
```

When serving the judge model with vllm, make sure to pass `--served-model-name gpt-4`.

Run with `--tensor-parallel-size <NUM GPUS>`, and consider increasing `--max-model-len` if you need a longer context window.

```shell
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --served-model-name gpt-4 --tensor-parallel-size 4
```

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_judge_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_judge_answers.py
```

Example output

```
eval_output/
├── mt_bench
│   └── model_answer
│       └── instructlab
│           └── granite-7b-lab.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── model_judgment
    │   │   └── gpt-4_single.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── gpt-4.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── model_judgment
        │   └── gpt-4_single.jsonl
        ├── question.jsonl
        └── reference_answer
            └── gpt-4.jsonl
```
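
To turn those judgments into a single score, you can average the `score` field across a judgment file, as in the sketch below. This assumes the FastChat-style single-judgment format (one JSON object per line with a numeric `score`, with `-1` marking a judgment that could not be parsed); the actual format is not shown in this commit.

```python
import json

# Hypothetical path; pick whichever model_judgment file you want to summarize.
judgment_file = "eval_output/pr_bench/main/model_judgment/gpt-4_single.jsonl"

scores = []
with open(judgment_file, "r", encoding="utf-8") as fin:
    for line in fin:
        record = json.loads(line)
        score = record.get("score", -1)
        if score >= 0:  # assumed convention: -1 means the judge reply was unusable
            scores.append(score)

if scores:
    print(f"{len(scores)} judgments, average score {sum(scores) / len(scores):.2f}")
else:
    print("no parsable judgments found")
```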
80 changes: 0 additions & 80 deletions data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl

This file was deleted.

160 changes: 0 additions & 160 deletions data/mt_bench/model_judgment/gpt-4_single.jsonl

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,9 +2,9 @@
FastChat
shortuuid
openai<1.0.0
anthropic
psutil
torch
transformers
accelerate
pandas
pandas-stubs
@@ -1,49 +1,43 @@
"""Generate answers with GPT-4
Usage:
python3 gen_api_answer.py --model gpt-3.5-turbo
"""
import argparse
# Standard
import concurrent.futures
import json
import os
import time
import concurrent.futures

# Third Party
# TODO need to look into this dependency
from fastchat.model.model_adapter import get_conversation_template # type: ignore
import openai
import shortuuid
import tqdm

from .common import (
load_questions,
temperature_config,
chat_completion_openai,
chat_completion_anthropic,
chat_completion_palm,
ANTHROPIC_MODEL_LIST,
)
#TODO need to look into this dependency
from fastchat.model.model_adapter import get_conversation_template
# Local
from .common import chat_completion_openai, load_questions, temperature_config


def reorg_answer_file(answer_file):
"""Sort by question id and de-duplication"""
answers = {}
with open(answer_file, "r") as fin:
with open(answer_file, "r", encoding="utf-8") as fin:
for l in fin:
qid = json.loads(l)["question_id"]
answers[qid] = l

qids = sorted(list(answers.keys()))
with open(answer_file, "w") as fout:
with open(answer_file, "w", encoding="utf-8") as fout:
for qid in qids:
fout.write(answers[qid])


def get_answer(
question: dict, model: str, num_choices: int, max_tokens: int, answer_file: str, force_temperature: float
question: dict,
model: str,
num_choices: int,
max_tokens: int,
answer_file: str,
force_temperature: float,
):
assert (
force_temperature is not None and "required_temperature" in question.keys()
) == False
assert force_temperature is None or question.get("required_temperature") is None
if force_temperature is not None:
temperature = force_temperature
elif "required_temperature" in question.keys():
@@ -54,7 +48,6 @@ def get_answer(
temperature = 0.7

choices = []
chat_state = None # for palm-2 model
for i in range(num_choices):
conv = get_conversation_template(model)

@@ -63,14 +56,7 @@
conv.append_message(conv.roles[0], question["turns"][j])
conv.append_message(conv.roles[1], None)

if model in ANTHROPIC_MODEL_LIST:
output = chat_completion_anthropic(model, conv, temperature, max_tokens)
elif model == "palm-2-chat-bison-001":
chat_state, output = chat_completion_palm(
chat_state, model, conv, temperature, max_tokens
)
else:
output = chat_completion_openai(model, conv, temperature, max_tokens)
output = chat_completion_openai(model, conv, temperature, max_tokens)

conv.update_last_message(output)
turns.append(output)
@@ -87,31 +73,45 @@
}

os.makedirs(os.path.dirname(answer_file), exist_ok=True)
with open(answer_file, "a") as fout:
with open(answer_file, "a", encoding="utf-8") as fout:
fout.write(json.dumps(ans) + "\n")

def run(
question_begin=None,
question_end=None,
force_temperature=None,
answer_file=None,
model_name="gpt-3.5-turbo",
num_choices=1,
max_tokens=1024,
parallel=1,
openai_api_base=None):

if openai_api_base is not None:
openai.api_base = openai_api_base

question_file = f"data/mt_bench/question.jsonl"

def generate_answers(
model_name,
branch=None,
output_dir="eval_output",
data_dir=None,
question_begin=None,
question_end=None,
force_temperature=None,
answer_file=None,
num_choices=1,
max_tokens=1024,
max_workers=1,
model_api_base=None,
bench_name="mt_bench",
):
if model_api_base is not None:
openai.api_base = model_api_base

if data_dir is None:
data_dir = os.path.join(os.path.dirname(__file__), "data")

data_base_dir = f"{data_dir}/{bench_name}"
output_base_dir = f"{output_dir}/{bench_name}"
if branch is not None:
data_base_dir = os.path.join(data_base_dir, branch)
output_base_dir = os.path.join(output_base_dir, branch)

question_file = f"{data_base_dir}/question.jsonl"
questions = load_questions(question_file, question_begin, question_end)

if not answer_file:
answer_file = f"data/mt_bench/model_answer/{model_name}.jsonl"
answer_file = f"{output_base_dir}/model_answer/{model_name}.jsonl"
print(f"Output to {answer_file}")

with concurrent.futures.ThreadPoolExecutor(max_workers=parallel) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for question in questions:
future = executor.submit(
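
The hunk above converts the old `gen_api_answer` command-line entry point into a reusable `generate_answers` function. A minimal sketch of how a caller such as `tests/test_gen_answers.py` might drive it against the locally served model follows; the import path is an assumption, since the module's filename is not visible in this diff.

```python
# Sketch only: the module path below is hypothetical, not taken from this commit.
from instructlab.eval.gen_api_answer import generate_answers

generate_answers(
    "instructlab/granite-7b-lab",               # model name as served by vllm
    output_dir="eval_output",                   # answers land under eval_output/mt_bench/
    max_workers=4,                              # parallel requests to the server
    model_api_base="http://localhost:8000/v1",  # assumed vllm OpenAI-compatible endpoint
    bench_name="mt_bench",
)
```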
