Working toward a functional state
Signed-off-by: Dan McPherson <[email protected]>
danmcp committed Jun 22, 2024
1 parent ad020ef commit 9882a55
Showing 25 changed files with 878 additions and 1,479 deletions.
10 changes: 9 additions & 1 deletion .spellcheck-en-custom.txt
@@ -2,5 +2,13 @@
 # make spellcheck-sort
 # Please keep this file sorted:
 # SPDX-License-Identifier: Apache-2.0
-eval
 Tatsu
+TODO
+eval
+gpt
+instructlab
+jsonl
+justfile
+openai
+venv
+vllm
91 changes: 90 additions & 1 deletion README.md
@@ -5,4 +5,93 @@
![Release](https://img.shields.io/github/v/release/instructlab/eval)
![License](https://img.shields.io/github/license/instructlab/eval)

Python Library for Evaluation

## MT-Bench / PR-Bench Testing Steps

```shell
git clone https://github.com/instructlab/taxonomy.git && pushd taxonomy && git branch rc && popd
git clone --bare https://github.com/instructlab/eval.git && git clone eval.git/ && cd eval && git remote add syncrepo ../eval.git
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
pip install -r requirements-dev.txt
pip install -e .
pip install vllm==0.3.3
python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 4
```
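
Once the vLLM server is up, it can be worth a quick smoke test before running the scripts below. A minimal check, assuming vLLM's default OpenAI-compatible endpoint at `http://localhost:8000/v1` (the URL and the dummy key are assumptions, not part of this commit):

```python
# Smoke test for the local vLLM server, using the pre-1.0 openai client
# pinned in requirements.txt (openai<1.0.0).
import openai

openai.api_base = "http://localhost:8000/v1"  # assumed default vLLM endpoint
openai.api_key = "NO_API_KEY"  # vLLM does not validate the key by default

# Should include instructlab/granite-7b-lab if the server came up correctly.
print([model["id"] for model in openai.Model.list()["data"]])
```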

In another shell window:

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_gen_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_gen_answers.py
```
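
These scripts are presumably thin wrappers around the library entry point visible in the diff further down (`generate_answers`). A hypothetical sketch of the equivalent direct calls, assuming an `instructlab.eval.gen_api_answer` import path (the real module path and test contents may differ):

```python
# Hypothetical equivalent of the two test scripts above. The import path is
# an assumption; the signature matches generate_answers in this commit.
from instructlab.eval.gen_api_answer import generate_answers

api_base = "http://localhost:8000/v1"  # the vLLM server started earlier

# MT-Bench answers -> eval_output/mt_bench/model_answer/...
generate_answers("instructlab/granite-7b-lab", model_api_base=api_base)

# PR-Bench answers, once per taxonomy branch being compared
# -> eval_output/pr_bench/<branch>/model_answer/...
for branch in ("main", "rc"):
    generate_answers(
        "instructlab/granite-7b-lab",
        branch=branch,
        bench_name="pr_bench",
        model_api_base=api_base,
    )
```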

Example output tree:

```shell
eval_output/
├── mt_bench
│   └── model_answer
│       └── instructlab
│           └── granite-7b-lab.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── instructlab
    │           └── granite-7b-lab.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── question.jsonl
        └── reference_answer
            └── instructlab
                └── granite-7b-lab.jsonl
```

```shell
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_judge_answers.py
OPENAI_API_KEY="NO_API_KEY" python3 tests/test_pr_bench_judge_answers.py
```

Example output tree:

```shell
eval_output/
├── mt_bench
│   ├── model_answer
│   │   └── instructlab
│   │       └── granite-7b-lab.jsonl
│   └── model_judgment
│       └── instructlab
│           └── granite-7b-lab_single.jsonl
└── pr_bench
    ├── main
    │   ├── model_answer
    │   │   └── instructlab
    │   │       └── granite-7b-lab.jsonl
    │   ├── model_judgment
    │   │   └── instructlab
    │   │       └── granite-7b-lab_single.jsonl
    │   ├── question.jsonl
    │   └── reference_answer
    │       └── instructlab
    │           └── granite-7b-lab.jsonl
    └── rc
        ├── model_answer
        │   └── instructlab
        │       └── granite-7b-lab.jsonl
        ├── model_judgment
        │   └── instructlab
        │       └── granite-7b-lab_single.jsonl
        ├── question.jsonl
        └── reference_answer
            └── instructlab
                └── granite-7b-lab.jsonl
```
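
The `*_single.jsonl` judgment files appear to follow FastChat's single-answer grading format, where each JSON line carries a numeric `score` (that field name comes from FastChat, not from anything in this commit). A small sketch for aggregating one of them:

```python
# Average the judge scores in one judgment file. Assumes FastChat's "single"
# grading format: one JSON object per line with a "score" field, -1 on failure.
import json

path = "eval_output/mt_bench/model_judgment/instructlab/granite-7b-lab_single.jsonl"
scores = []
with open(path, encoding="utf-8") as fin:
    for line in fin:
        score = json.loads(line).get("score", -1)
        if score >= 0:
            scores.append(score)

print(f"{len(scores)} judgments, average score {sum(scores) / len(scores):.2f}")
```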
80 changes: 0 additions & 80 deletions data/mt_bench/model_answer/instructlab/granite-7b-lab.jsonl

This file was deleted.

160 changes: 0 additions & 160 deletions data/mt_bench/model_judgment/gpt-4_single.jsonl

This file was deleted.

80 changes: 0 additions & 80 deletions data/mt_bench/reference_answer/gpt-4-turbo.jsonl

This file was deleted.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+FastChat
+GitPython>=3.1.42,<4.0.0
 shortuuid
 openai<1.0.0
-anthropic
 psutil
 torch
 transformers
 accelerate
 pandas
 pandas-stubs
gen_api_answer.py (full path not captured in this view)
@@ -1,49 +1,49 @@
 """Generate answers with GPT-4
 Usage:
 python3 gen_api_answer.py --model gpt-3.5-turbo
 """
-import argparse
+# Standard
+import concurrent.futures
 import json
 import os
 import time
-import concurrent.futures
 
+# Third Party
+# TODO need to look into this dependency
+from fastchat.model.model_adapter import get_conversation_template  # type: ignore
 import openai
 import shortuuid
 import tqdm
 
+# Local
 from .common import (
+    bench_dir,
+    chat_completion_openai,
     load_questions,
     temperature_config,
-    chat_completion_openai,
-    chat_completion_anthropic,
-    chat_completion_palm,
-    ANTHROPIC_MODEL_LIST,
 )
-#TODO need to look into this dependency
-from fastchat.model.model_adapter import get_conversation_template
 
 
 def reorg_answer_file(answer_file):
     """Sort by question id and de-duplication"""
     answers = {}
-    with open(answer_file, "r") as fin:
+    with open(answer_file, "r", encoding="utf-8") as fin:
         for l in fin:
             qid = json.loads(l)["question_id"]
             answers[qid] = l
 
     qids = sorted(list(answers.keys()))
-    with open(answer_file, "w") as fout:
+    with open(answer_file, "w", encoding="utf-8") as fout:
         for qid in qids:
             fout.write(answers[qid])
 
 
 def get_answer(
-    question: dict, model: str, num_choices: int, max_tokens: int, answer_file: str, force_temperature: float
+    question: dict,
+    model: str,
+    num_choices: int,
+    max_tokens: int,
+    answer_file: str,
+    force_temperature: float,
 ):
-    assert (
-        force_temperature is not None and "required_temperature" in question.keys()
-    ) == False
+    """Answer a question with the model"""
+    assert force_temperature is None or question.get("required_temperature") is None
     if force_temperature is not None:
         temperature = force_temperature
     elif "required_temperature" in question.keys():
@@ -54,7 +54,6 @@ def get_answer(
         temperature = 0.7
 
     choices = []
-    chat_state = None  # for palm-2 model
     for i in range(num_choices):
         conv = get_conversation_template(model)
 
@@ -63,14 +62,7 @@ def get_answer(
             conv.append_message(conv.roles[0], question["turns"][j])
             conv.append_message(conv.roles[1], None)
 
-            if model in ANTHROPIC_MODEL_LIST:
-                output = chat_completion_anthropic(model, conv, temperature, max_tokens)
-            elif model == "palm-2-chat-bison-001":
-                chat_state, output = chat_completion_palm(
-                    chat_state, model, conv, temperature, max_tokens
-                )
-            else:
-                output = chat_completion_openai(model, conv, temperature, max_tokens)
+            output = chat_completion_openai(model, conv, temperature, max_tokens)
 
             conv.update_last_message(output)
             turns.append(output)
@@ -87,31 +79,43 @@ def get_answer(
     }
 
     os.makedirs(os.path.dirname(answer_file), exist_ok=True)
-    with open(answer_file, "a") as fout:
+    with open(answer_file, "a", encoding="utf-8") as fout:
         fout.write(json.dumps(ans) + "\n")
 
-def run(
-    question_begin=None,
-    question_end=None,
-    force_temperature=None,
-    answer_file=None,
-    model_name="gpt-3.5-turbo",
-    num_choices=1,
-    max_tokens=1024,
-    parallel=1,
-    openai_api_base=None):
-
-    if openai_api_base is not None:
-        openai.api_base = openai_api_base
-
-    question_file = f"data/mt_bench/question.jsonl"
-
+
+def generate_answers(
+    model_name,
+    branch=None,
+    output_dir="eval_output",
+    data_dir=None,
+    question_begin=None,
+    question_end=None,
+    force_temperature=None,
+    num_choices=1,
+    max_tokens=1024,
+    max_workers=1,
+    model_api_base=None,
+    bench_name="mt_bench",
+):
+    """Generate model answers to be judged"""
+    if model_api_base is not None:
+        openai.api_base = model_api_base
+
+    if data_dir is None:
+        data_dir = os.path.join(os.path.dirname(__file__), "data")
+
+    data_base_dir = bench_dir(data_dir, bench_name, branch)
+    output_base_dir = bench_dir(output_dir, bench_name, branch)
+
+    question_file = f"{data_base_dir}/question.jsonl"
     questions = load_questions(question_file, question_begin, question_end)
 
-    if not answer_file:
-        answer_file = f"data/mt_bench/model_answer/{model_name}.jsonl"
-    print(f"Output to {answer_file}")
+    answer_file = f"{output_base_dir}/model_answer/{model_name}.jsonl"
+    if os.path.isfile(answer_file):
+        os.remove(answer_file)
+    # print(f"Output to {answer_file}")
 
-    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []
         for question in questions:
             future = executor.submit(
(diff truncated; the remaining changed files are not shown)
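
For orientation, the `bench_dir` helper imported from `.common` evidently maps a base directory, bench name, and optional branch onto the layout shown in the README trees above. A plausible reconstruction, not the committed implementation:

```python
import os


def bench_dir(base_dir: str, bench_name: str, branch: str | None) -> str:
    """Sketch: eval_output/mt_bench, or eval_output/pr_bench/<branch>."""
    path = os.path.join(base_dir, bench_name)
    if branch is not None:
        path = os.path.join(path, branch)
    return path
```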
