Merge pull request #55 from danmcp/logging
Adding basic logging facilities for eval with a first pass at some useful logging
JamesKunstle authored Jul 8, 2024
2 parents 76eb1e4 + 0fb699f commit 450acaf
Showing 8 changed files with 77 additions and 4 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ GitPython>=3.1.42,<4.0.0
shortuuid
openai>=1.13.3,<2.0.0
psutil
rich==13.7.1
torch
transformers
accelerate
18 changes: 18 additions & 0 deletions src/instructlab/eval/logger_config.py
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# Standard
import logging

# Third Party
from rich.logging import RichHandler


def setup_logger(name):
# Set up the logger
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
datefmt="[%X]",
handlers=[RichHandler()],
)
logger = logging.getLogger(name)
return logger
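
For orientation, the evaluator modules below all consume this helper the same way: obtain a module-level logger via setup_logger(__name__) and log the function arguments with logger.debug(locals()). A minimal sketch of that pattern (the function name and its arguments are illustrative, not part of this change); note that because setup_logger configures logging at INFO, the new debug messages stay hidden unless the level is lowered, as shown at the end:

```python
import logging

from instructlab.eval.logger_config import setup_logger

logger = setup_logger(__name__)


def my_eval_step(model_name, output_dir):
    # Mirrors the pattern added in this change: log call arguments at debug level.
    logger.debug(locals())
    logger.debug("Writing results to: %s", output_dir)


# setup_logger leaves the level at INFO, so lower it to surface the debug output.
logging.getLogger().setLevel(logging.DEBUG)
my_eval_step("example-model", "/tmp/eval-output")  # illustrative values only
```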
7 changes: 7 additions & 0 deletions src/instructlab/eval/mmlu.py
@@ -11,6 +11,11 @@
# First Party
from instructlab.eval.evaluator import Evaluator

# Local
from .logger_config import setup_logger

logger = setup_logger(__name__)

MMLU_TASKS = [
"mmlu_abstract_algebra",
"mmlu_anatomy",
@@ -109,6 +114,7 @@ def run(self) -> tuple:
overall_score MMLU score for the overall model evaluation
individual_scores Individual MMLU score for each task
"""
logger.debug(locals())
# TODO: make this a parameter for class?
os.environ["TOKENIZERS_PARALLELISM"] = "true"

@@ -175,6 +181,7 @@ def run(self) -> tuple:
overall_score Average MMLUBranch score for the task group
individual_scores Individual MMLUBranch scores for each task in the task group
"""
logger.debug(locals())
# TODO: make this a parameter for class?
os.environ["TOKENIZERS_PARALLELISM"] = "true"

7 changes: 7 additions & 0 deletions src/instructlab/eval/mt_bench.py
@@ -9,6 +9,9 @@

# Local
from .evaluator import Evaluator
from .logger_config import setup_logger

logger = setup_logger(__name__)


class MTBenchEvaluator(Evaluator):
@@ -46,6 +49,7 @@ def gen_answers(self, server_url) -> None:
Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
"""
logger.debug(locals())
mt_bench_answers.generate_answers(
self.model_name,
server_url,
@@ -65,6 +69,7 @@ def judge_answers(self, server_url) -> tuple:
qa_pairs Question and answer pairs (with scores) from the evaluation
turn_scores A list of indexed turn scores
"""
logger.debug(locals())
return mt_bench_judgment.generate_judgment(
self.model_name,
self.judge_model_name,
@@ -116,6 +121,7 @@ def gen_answers(self, server_url) -> None:
Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
"""
logger.debug(locals())
mt_bench_branch_generator.generate(
self.judge_model_name,
self.branch,
@@ -142,6 +148,7 @@ def judge_answers(self, server_url) -> tuple:
Returns:
qa_pairs Question and answer pairs (with scores) from the evaluation
"""
logger.debug(locals())
_, qa_pairs, _, error_rate = mt_bench_judgment.generate_judgment(
self.model_name,
self.judge_model_name,
7 changes: 7 additions & 0 deletions src/instructlab/eval/mt_bench_answers.py
@@ -13,16 +13,20 @@
import tqdm

# Local
from .logger_config import setup_logger
from .mt_bench_common import (
bench_dir,
chat_completion_openai,
load_questions,
temperature_config,
)

logger = setup_logger(__name__)


def reorg_answer_file(answer_file):
"""Sort by question id and de-duplication"""
logger.debug(locals())
answers = {}
with open(answer_file, "r", encoding="utf-8") as fin:
for l in fin:
@@ -106,6 +110,7 @@ def generate_answers(
bench_name="mt_bench",
):
"""Generate model answers to be judged"""
logger.debug(locals())
openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

if data_dir is None:
@@ -120,11 +125,13 @@
answer_file = f"{output_base_dir}/model_answer/{model_name}.jsonl"
if os.path.isfile(answer_file):
os.remove(answer_file)
logger.debug("Removing previous answer file: %s", answer_file)

first_n = None
first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS")
if first_n_env:
first_n = int(first_n_env)
logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n)

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
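
Both answer generation above and judgment generation further below read the INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS environment variable and log the value they pick up. A minimal sketch of using it to shorten a debugging run (the value 5 is arbitrary):

```python
import os

# Evaluate only the first few questions; the variable name comes from the diff
# above, the value here is arbitrary.
os.environ["INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS"] = "5"
```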
14 changes: 10 additions & 4 deletions src/instructlab/eval/mt_bench_branch_generator.py
@@ -12,10 +12,14 @@
import yaml

# Local
from .logger_config import setup_logger
from .mt_bench_common import bench_dir

logger = setup_logger(__name__)


def get_file_paths(directory):
logger.debug(locals())
file_paths = []
for root, _, files in os.walk(directory):
for file in files:
@@ -32,6 +36,7 @@ def read_qna(fn):

def generate(judge_model_name, branch, taxonomy_dir, output_dir):
"""Create questions and reference answers from taxonomy"""
logger.debug(locals())
restore_branch = None
try:
if branch is not None:
@@ -90,20 +95,21 @@ def generate(judge_model_name, branch, taxonomy_dir, output_dir):
}
)

print(f"generated {len(question_lst)} questions")
logger.debug("Generated %s questions", len(question_lst))

output_base_dir = bench_dir(output_dir, "mt_bench_branch", branch)
os.makedirs(output_base_dir, exist_ok=True)
question_fn = "question.jsonl"
with open(
os.path.join(output_base_dir, question_fn), "w", encoding="utf-8"
) as outfile:
question_file = os.path.join(output_base_dir, question_fn)
logger.debug("Generating question file: %s", question_file)
with open(question_file, "w", encoding="utf-8") as outfile:
for entry in question_lst:
json.dump(entry, outfile)
outfile.write("\n")

answer_file_dir = os.path.join(output_base_dir, "reference_answer")
answer_file = os.path.join(answer_file_dir, f"{judge_model_name}.jsonl")
logger.debug("Generating answer file: %s", answer_file)
os.makedirs(os.path.dirname(answer_file), exist_ok=True)
with open(
answer_file,
16 changes: 16 additions & 0 deletions src/instructlab/eval/mt_bench_common.py
@@ -17,6 +17,11 @@
from fastchat.model.model_adapter import get_conversation_template # type: ignore
import openai

# Local
from .logger_config import setup_logger

logger = setup_logger(__name__)

# API setting constants
API_MAX_RETRY = 4
API_RETRY_SLEEP = 4
@@ -85,6 +90,7 @@ def load_model_answers(answer_dir: str, model_name=None) -> dict:
The return value is a python dict of type:
Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
"""
logger.debug(locals())
model_answers = {}
for root, _, files in os.walk(answer_dir):
for filename in files:
@@ -99,6 +105,7 @@ def load_model_answers(answer_dir: str, model_name=None) -> dict:
answer[l["question_id"]] = l
model_answers[model_name or file_model_name] = answer
if model_name == file_model_name:
logger.debug("Found answer file matching: %s", model_name)
break
return model_answers

@@ -109,6 +116,7 @@ def load_judge_prompts(prompt_file: str) -> dict:
The return value is a python dict of type:
Dict[judge_name: str -> dict]
"""
logger.debug(locals())
prompts = {}
with open(prompt_file, encoding="utf-8") as fin:
for line in fin:
@@ -176,6 +184,11 @@ def run_judge_single(
rating = ast.literal_eval(match.groups()[0])
else:
rating = -1
logger.debug(
"Received invalid judgment for question %s with judgment: %s",
question["question_id"],
judgment,
)
else:
raise ValueError(
f"invalid output format: {judge.prompt_template['output_format']}"
@@ -263,6 +276,8 @@ def chat_completion_openai(
if i == API_MAX_RETRY - 1:
# Print error on last try
print(type(e), e)
else:
logger.debug(e)
time.sleep(API_RETRY_SLEEP)

return output
@@ -290,6 +305,7 @@ def check_data(questions, model_answers, ref_answers, models, judges):


def get_model_list(answer_dir):
logger.debug(locals())
file_paths = glob.glob(f"{answer_dir}/*.jsonl")
file_names = [os.path.splitext(os.path.basename(f))[0] for f in file_paths]
return file_names
11 changes: 11 additions & 0 deletions src/instructlab/eval/mt_bench_judgment.py
@@ -10,6 +10,7 @@
import pandas as pd

# Local
from .logger_config import setup_logger
from .mt_bench_common import (
NEED_REF_CATS,
Judge,
@@ -23,6 +24,8 @@
play_a_match_single,
)

logger = setup_logger(__name__)


def make_match_single(
questions,
@@ -78,6 +81,7 @@ def make_judgment(
bench_name="mt_bench",
):
"""Create judgment output"""
logger.debug(locals())
judgment_df_all = pd.read_json(
judgment_file, lines=True, dtype={"question_id": str}
)
@@ -86,6 +90,9 @@
judgment_df = judgment_df[judgment_df["score"] != -1]
error_free_judgments_len = len(judgment_df)
error_rate = (judgments_len - error_free_judgments_len) / judgments_len
logger.debug("#judgments: %s", judgments_len)
logger.debug("#error free judgments: %s", error_free_judgments_len)
logger.debug("error rate: %s", error_rate)

turn_scores = []
# First turn
@@ -154,6 +161,7 @@ def judge_model(
merge_system_user_message=False,
):
"""Judge the model based on questions and reference answers"""
logger.debug(locals())
package_data_dir = os.path.join(os.path.dirname(__file__), "data")
if data_dir is None:
data_dir = package_data_dir
@@ -190,6 +198,7 @@ def judge_model(
output_file = f"{output_base_dir}/model_judgment/{judge_model_name}_single.jsonl"
if os.path.isfile(output_file):
os.remove(output_file)
logger.debug("Removing previous judgment file: %s", output_file)

check_data(questions, model_answers, ref_answers, models, judges)

@@ -277,11 +286,13 @@ def generate_judgment(
merge_system_user_message=False,
):
"""Generate judgment with scores and qa_pairs for a model"""
logger.debug(locals())
openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")

first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS")
if first_n_env is not None and first_n is None:
first_n = int(first_n_env)
logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n)

question_file, judgment_file, answer_file = judge_model(
model_name,
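
For reference, the error rate that make_judgment now logs is simply the fraction of judgments whose score failed to parse (score == -1). A small sketch with hypothetical counts, not taken from a real run:

```python
# Hypothetical counts chosen only to illustrate the values make_judgment logs.
judgments_len = 80
error_free_judgments_len = 76

error_rate = (judgments_len - error_free_judgments_len) / judgments_len
print(error_rate)  # 0.05
```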
