Working toward a functional state
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 21, 2024
1 parent ad020ef commit 3ab7e72
Showing 25 changed files with 769 additions and 1,385 deletions.
10 changes: 9 additions & 1 deletion .spellcheck-en-custom.txt
@@ -2,5 +2,13 @@
# make spellcheck-sort
# Please keep this file sorted:
# SPDX-License-Identifier: Apache-2.0
eval
Tatsu
TODO
eval
gpt
instructlab
jsonl
justfile
openai
venv
vllm
88 changes: 87 additions & 1 deletion README.md
@@ -5,4 +5,90 @@
![Release](https://img.shields.io/github/v/release/instructlab/eval)
![License](https://img.shields.io/github/license/instructlab/eval)

Python library for Evaluation
Python Library for Evaluation

## MT-Bench / PR-Bench Testing Steps

TODO: Figure out the right vllm version. The latest release fails with `openai.types` not found in the same venv.

```shell
pip install vllm==0.3.3
```

Run with `--tensor-parallel-size <NUM GPUS>`, and consider increasing `--max-model-len` if you need a longer context window.

```shell
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 4
```

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_gen_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_gen_answers.py
```

Example output

```
eval_output/
├── mt_bench
│   └── model_answer
│       └── instructlab
│           └── granite-7b-lab.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── gpt-4.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── question.jsonl
        └── reference_answer
            └── gpt-4.jsonl
```

When serving the judge model with vllm, make sure to pass `--served-model-name gpt-4`.

Run with `--tensor-parallel-size <NUM GPUS>`, and consider increasing `--max-model-len` if you need a longer context window.

```shell
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --served-model-name gpt-4 --tensor-parallel-size 4
```

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_judge_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_judge_answers.py
```

Example output

```
eval_output/
├── mt_bench
│   └── model_answer
│       └── instructlab
│           └── granite-7b-lab.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── model_judgment
    │   │   └── gpt-4_single.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── gpt-4.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── model_judgment
        │   └── gpt-4_single.jsonl
        ├── question.jsonl
        └── reference_answer
            └── gpt-4.jsonl
```
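
To turn those judgments into a single score, you can average the `score` field across a judgment file, as in the sketch below. This assumes the FastChat-style single-judgment format (one JSON object per line with a numeric `score`, with `-1` marking a judgment that could not be parsed); the actual format is not shown in this commit.

```python
import json

# Hypothetical path; pick whichever model_judgment file you want to summarize.
judgment_file = "eval_output/pr_bench/main/model_judgment/gpt-4_single.jsonl"

scores = []
with open(judgment_file, "r", encoding="utf-8") as fin:
    for line in fin:
        record = json.loads(line)
        score = record.get("score", -1)
        if score >= 0:  # assumed convention: -1 means the judge reply was unusable
            scores.append(score)

if scores:
    print(f"{len(scores)} judgments, average score {sum(scores) / len(scores):.2f}")
else:
    print("no parsable judgments found")
```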
80 changes: 0 additions & 80 deletions data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl

This file was deleted.

160 changes: 0 additions & 160 deletions data/mt_bench/model_judgment/gpt-4_single.jsonl

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,9 +2,9 @@
FastChat
shortuuid
openai<1.0.0
anthropic
psutil
torch
transformers
accelerate
pandas
pandas-stubs
@@ -1,49 +1,43 @@
"""Generate answers with GPT-4
Usage:
python3 gen_api_answer.py --model gpt-3.5-turbo
"""
import argparse
# Standard
import concurrent.futures
import json
import os
import time
import concurrent.futures

# Third Party
# TODO need to look into this dependency
from fastchat.model.model_adapter import get_conversation_template # type: ignore
import openai
import shortuuid
import tqdm

from .common import (
load_questions,
temperature_config,
chat_completion_openai,
chat_completion_anthropic,
chat_completion_palm,
ANTHROPIC_MODEL_LIST,
)
#TODO need to look into this dependency
from fastchat.model.model_adapter import get_conversation_template
# Local
from .common import chat_completion_openai, load_questions, temperature_config


def reorg_answer_file(answer_file):
"""Sort by question id and de-duplication"""
answers = {}
with open(answer_file, "r") as fin:
with open(answer_file, "r", encoding="utf-8") as fin:
for l in fin:
qid = json.loads(l)["question_id"]
answers[qid] = l

qids = sorted(list(answers.keys()))
with open(answer_file, "w") as fout:
with open(answer_file, "w", encoding="utf-8") as fout:
for qid in qids:
fout.write(answers[qid])


def get_answer(
question: dict, model: str, num_choices: int, max_tokens: int, answer_file: str, force_temperature: float
question: dict,
model: str,
num_choices: int,
max_tokens: int,
answer_file: str,
force_temperature: float,
):
assert (
force_temperature is not None and "required_temperature" in question.keys()
) == False
assert force_temperature is None or question.get("required_temperature") is None
if force_temperature is not None:
temperature = force_temperature
elif "required_temperature" in question.keys():
@@ -54,7 +48,6 @@ def get_answer(
temperature = 0.7

choices = []
chat_state = None # for palm-2 model
for i in range(num_choices):
conv = get_conversation_template(model)

@@ -63,14 +56,7 @@
conv.append_message(conv.roles[0], question["turns"][j])
conv.append_message(conv.roles[1], None)

if model in ANTHROPIC_MODEL_LIST:
output = chat_completion_anthropic(model, conv, temperature, max_tokens)
elif model == "palm-2-chat-bison-001":
chat_state, output = chat_completion_palm(
chat_state, model, conv, temperature, max_tokens
)
else:
output = chat_completion_openai(model, conv, temperature, max_tokens)
output = chat_completion_openai(model, conv, temperature, max_tokens)

conv.update_last_message(output)
turns.append(output)
@@ -87,31 +73,45 @@
}

os.makedirs(os.path.dirname(answer_file), exist_ok=True)
with open(answer_file, "a") as fout:
with open(answer_file, "a", encoding="utf-8") as fout:
fout.write(json.dumps(ans) + "\n")

def run(
question_begin=None,
question_end=None,
force_temperature=None,
answer_file=None,
model_name="gpt-3.5-turbo",
num_choices=1,
max_tokens=1024,
parallel=1,
openai_api_base=None):

if openai_api_base is not None:
openai.api_base = openai_api_base

question_file = f"data/mt_bench/question.jsonl"

def generate_answers(
model_name,
branch=None,
output_dir="eval_output",
data_dir=None,
question_begin=None,
question_end=None,
force_temperature=None,
answer_file=None,
num_choices=1,
max_tokens=1024,
max_workers=1,
model_api_base=None,
bench_name="mt_bench",
):
if model_api_base is not None:
openai.api_base = model_api_base

if data_dir is None:
data_dir = os.path.join(os.path.dirname(__file__), "data")

data_base_dir = f"{data_dir}/{bench_name}"
output_base_dir = f"{output_dir}/{bench_name}"
if branch is not None:
data_base_dir = os.path.join(data_base_dir, branch)
output_base_dir = os.path.join(output_base_dir, branch)

question_file = f"{data_base_dir}/question.jsonl"
questions = load_questions(question_file, question_begin, question_end)

if not answer_file:
answer_file = f"data/mt_bench/model_answer/{model_name}.jsonl"
answer_file = f"{output_base_dir}/model_answer/{model_name}.jsonl"
print(f"Output to {answer_file}")

with concurrent.futures.ThreadPoolExecutor(max_workers=parallel) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for question in questions:
future = executor.submit(
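
The hunk above converts the old `gen_api_answer` command-line entry point into a reusable `generate_answers` function. A minimal sketch of how a caller such as `tests/test_gen_answers.py` might drive it against the locally served model follows; the import path is an assumption, since the module's filename is not visible in this diff.

```python
# Sketch only: the module path below is hypothetical, not taken from this commit.
from instructlab.eval.gen_api_answer import generate_answers

generate_answers(
    "instructlab/granite-7b-lab",               # model name as served by vllm
    output_dir="eval_output",                   # answers land under eval_output/mt_bench/
    max_workers=4,                              # parallel requests to the server
    model_api_base="http://localhost:8000/v1",  # assumed vllm OpenAI-compatible endpoint
    bench_name="mt_bench",
)
```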
