diff --git a/requirements.txt b/requirements.txt
index 67b7179..5170e18 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ GitPython>=3.1.42,<4.0.0
 shortuuid
 openai>=1.13.3,<2.0.0
 psutil
+rich==13.7.1
 torch
 transformers
 accelerate
diff --git a/src/instructlab/eval/logger_config.py b/src/instructlab/eval/logger_config.py
new file mode 100644
index 0000000..37c958a
--- /dev/null
+++ b/src/instructlab/eval/logger_config.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# Standard
+import logging
+
+# Third Party
+from rich.logging import RichHandler
+
+
+def setup_logger(name):
+    # Set up the logger
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(message)s",
+        datefmt="[%X]",
+        handlers=[RichHandler()],
+    )
+    logger = logging.getLogger(name)
+    return logger
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index a3d9f6c..5d6f18a 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -11,6 +11,11 @@
 # First Party
 from instructlab.eval.evaluator import Evaluator
 
+# Local
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
 MMLU_TASKS = [
     "mmlu_abstract_algebra",
     "mmlu_anatomy",
@@ -109,6 +114,7 @@ def run(self) -> tuple:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
         """
+        logger.debug(locals())
         # TODO: make this a parameter for class?
         os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
@@ -175,6 +181,7 @@ def run(self) -> tuple:
             overall_score       Average MMLUBranch score for the task group
             individual_scores   Individual MMLUBranch scores for each task in the task group
         """
+        logger.debug(locals())
         # TODO: make this a parameter for class?
         os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
diff --git a/src/instructlab/eval/mt_bench.py b/src/instructlab/eval/mt_bench.py
index 729040d..a8cd9a5 100644
--- a/src/instructlab/eval/mt_bench.py
+++ b/src/instructlab/eval/mt_bench.py
@@ -9,6 +9,9 @@
 
 # Local
 from .evaluator import Evaluator
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
 
 
 class MTBenchEvaluator(Evaluator):
@@ -43,6 +46,7 @@ def gen_answers(self, server_url) -> None:
         Attributes
             server_url  Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
         """
+        logger.debug(locals())
         mt_bench_answers.generate_answers(
             self.model_name,
             server_url,
@@ -62,6 +66,7 @@ def judge_answers(self, server_url) -> tuple:
             qa_pairs        Question and answer pairs (with scores) from the evaluation
             turn_scores     A list of indexed turn scores
         """
+        logger.debug(locals())
         return mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
@@ -109,6 +114,7 @@ def gen_answers(self, server_url) -> None:
         Attributes
             server_url  Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
         """
+        logger.debug(locals())
        mt_bench_branch_generator.generate(
             self.judge_model_name,
             self.branch,
@@ -135,6 +141,7 @@ def judge_answers(self, server_url) -> tuple:
         Returns:
             qa_pairs    Question and answer pairs (with scores) from the evaluation
         """
+        logger.debug(locals())
         _, qa_pairs, _, error_rate = mt_bench_judgment.generate_judgment(
             self.model_name,
             self.judge_model_name,
diff --git a/src/instructlab/eval/mt_bench_answers.py b/src/instructlab/eval/mt_bench_answers.py
index 908c58e..20de55f 100644
--- a/src/instructlab/eval/mt_bench_answers.py
+++ b/src/instructlab/eval/mt_bench_answers.py
@@ -12,6 +12,7 @@
 import tqdm
 
 # Local
+from .logger_config import setup_logger
 from .mt_bench_common import (
     bench_dir,
     chat_completion_openai,
@@ -19,9 +20,12 @@
     temperature_config,
 )
 
+logger = setup_logger(__name__)
+
 
 def reorg_answer_file(answer_file):
     """Sort by question id and de-duplication"""
+    logger.debug(locals())
     answers = {}
     with open(answer_file, "r", encoding="utf-8") as fin:
         for l in fin:
@@ -101,6 +105,7 @@ def generate_answers(
     bench_name="mt_bench",
 ):
     """Generate model answers to be judged"""
+    logger.debug(locals())
     openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")
 
     if data_dir is None:
@@ -115,11 +120,13 @@
     answer_file = f"{output_base_dir}/model_answer/{model_name}.jsonl"
     if os.path.isfile(answer_file):
         os.remove(answer_file)
+        logger.debug("Removing previous answer file: %s", answer_file)
 
     first_n = None
     first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS")
     if first_n_env:
         first_n = int(first_n_env)
+        logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n)
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []
diff --git a/src/instructlab/eval/mt_bench_branch_generator.py b/src/instructlab/eval/mt_bench_branch_generator.py
index bff984e..f4f1b9b 100644
--- a/src/instructlab/eval/mt_bench_branch_generator.py
+++ b/src/instructlab/eval/mt_bench_branch_generator.py
@@ -11,10 +11,14 @@
 import yaml
 
 # Local
+from .logger_config import setup_logger
 from .mt_bench_common import bench_dir
 
+logger = setup_logger(__name__)
+
 
 def get_file_paths(directory):
+    logger.debug(locals())
     file_paths = []
     for root, _, files in os.walk(directory):
         for file in files:
@@ -31,6 +35,7 @@ def read_qna(fn):
 
 def generate(judge_model_name, branch, taxonomy_dir, output_dir):
     """Create questions and reference answers from taxonomy"""
+    logger.debug(locals())
     restore_branch = None
     try:
         if branch is not None:
@@ -89,20 +94,21 @@ def generate(judge_model_name, branch, taxonomy_dir, output_dir):
                         }
                     )
 
-    print(f"generated {len(question_lst)} questions")
+    logger.debug("Generated %s questions", len(question_lst))
 
     output_base_dir = bench_dir(output_dir, "mt_bench_branch", branch)
     os.makedirs(output_base_dir, exist_ok=True)
     question_fn = "question.jsonl"
-    with open(
-        os.path.join(output_base_dir, question_fn), "w", encoding="utf-8"
-    ) as outfile:
+    question_file = os.path.join(output_base_dir, question_fn)
+    logger.debug("Generating question file: %s", question_file)
+    with open(question_file, "w", encoding="utf-8") as outfile:
         for entry in question_lst:
             json.dump(entry, outfile)
             outfile.write("\n")
 
     answer_file_dir = os.path.join(output_base_dir, "reference_answer")
     answer_file = os.path.join(answer_file_dir, f"{judge_model_name}.jsonl")
+    logger.debug("Generating answer file: %s", answer_file)
     os.makedirs(os.path.dirname(answer_file), exist_ok=True)
     with open(
         answer_file,
diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index efbf781..22b0577 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -16,6 +16,11 @@
 from fastchat.model.model_adapter import get_conversation_template  # type: ignore
 import openai
 
+# Local
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
 # API setting constants
 API_MAX_RETRY = 4
 API_RETRY_SLEEP = 4
@@ -84,6 +89,7 @@ def load_model_answers(answer_dir: str, model_name=None) -> dict:
     The return value is a python dict of type:
     Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
     """
+    logger.debug(locals())
     model_answers = {}
     for root, _, files in os.walk(answer_dir):
         for filename in files:
@@ -98,6 +104,7 @@ def load_model_answers(answer_dir: str, model_name=None) -> dict:
                         answer[l["question_id"]] = l
                 model_answers[model_name or file_model_name] = answer
                 if model_name == file_model_name:
+                    logger.debug("Found answer file matching: %s", model_name)
                     break
 
     return model_answers
@@ -108,6 +115,7 @@ def load_judge_prompts(prompt_file: str) -> dict:
     The return value is a python dict of type:
     Dict[judge_name: str -> dict]
     """
+    logger.debug(locals())
     prompts = {}
     with open(prompt_file, encoding="utf-8") as fin:
         for line in fin:
@@ -163,6 +171,11 @@ def run_judge_single(
             rating = ast.literal_eval(match.groups()[0])
         else:
             rating = -1
+            logger.debug(
+                "Received invalid judgment for question %s with judgment: %s",
+                question["question_id"],
+                judgment,
+            )
     else:
         raise ValueError(
             f"invalid output format: {judge.prompt_template['output_format']}"
@@ -245,6 +258,8 @@ def chat_completion_openai(openai_client, model, conv, temperature, max_tokens)
             if i == API_MAX_RETRY - 1:
                 # Print error on last try
                 print(type(e), e)
+            else:
+                logger.debug(e)
             time.sleep(API_RETRY_SLEEP)
 
     return output
@@ -272,6 +287,7 @@ def check_data(questions, model_answers, ref_answers, models, judges):
 
 
 def get_model_list(answer_dir):
+    logger.debug(locals())
     file_paths = glob.glob(f"{answer_dir}/*.jsonl")
     file_names = [os.path.splitext(os.path.basename(f))[0] for f in file_paths]
     return file_names
diff --git a/src/instructlab/eval/mt_bench_judgment.py b/src/instructlab/eval/mt_bench_judgment.py
index d7ce35c..b8acb81 100644
--- a/src/instructlab/eval/mt_bench_judgment.py
+++ b/src/instructlab/eval/mt_bench_judgment.py
@@ -9,6 +9,7 @@
 import pandas as pd
 
 # Local
+from .logger_config import setup_logger
 from .mt_bench_common import (
     NEED_REF_CATS,
     Judge,
@@ -22,6 +23,8 @@
     play_a_match_single,
 )
 
+logger = setup_logger(__name__)
+
 
 def make_match_single(
     questions,
@@ -77,6 +80,7 @@ def make_judgment(
     bench_name="mt_bench",
 ):
     """Create judgment output"""
+    logger.debug(locals())
     judgment_df_all = pd.read_json(
         judgment_file, lines=True, dtype={"question_id": str}
     )
@@ -85,6 +89,9 @@
     judgment_df = judgment_df[judgment_df["score"] != -1]
     error_free_judgments_len = len(judgment_df)
     error_rate = (judgments_len - error_free_judgments_len) / judgments_len
+    logger.debug("#judgments: %s", judgments_len)
+    logger.debug("#error free judgments: %s", error_free_judgments_len)
+    logger.debug("error rate: %s", error_rate)
 
     turn_scores = []
     # First turn
@@ -152,6 +159,7 @@ def judge_model(
     first_n=None,
 ):
     """Judge the model based on questions and reference answers"""
+    logger.debug(locals())
     package_data_dir = os.path.join(os.path.dirname(__file__), "data")
     if data_dir is None:
         data_dir = package_data_dir
@@ -188,6 +196,7 @@ def judge_model(
     output_file = f"{output_base_dir}/model_judgment/{judge_model_name}_single.jsonl"
     if os.path.isfile(output_file):
         os.remove(output_file)
+        logger.debug("Removing previous judgment file: %s", output_file)
 
     check_data(questions, model_answers, ref_answers, models, judges)
 
@@ -264,11 +273,13 @@ def generate_judgment(
     first_n=None,
 ):
     """Generate judgment with scores and qa_pairs for a model"""
+    logger.debug(locals())
     openai_client = openai.OpenAI(base_url=model_api_base, api_key="NO_API_KEY")
 
     first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS")
     if first_n_env is not None and first_n is None:
         first_n = int(first_n_env)
+        logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n)
 
     question_file, judgment_file, answer_file = judge_model(
         model_name,