diff --git a/jlm_fin_eval/__init__.py b/jlm_fin_eval/__init__.py new file mode 100644 index 0000000..bb5cee6 --- /dev/null +++ b/jlm_fin_eval/__init__.py @@ -0,0 +1 @@ +from .api import metrics diff --git a/jlm_fin_eval/api/metrics.py b/jlm_fin_eval/api/metrics.py new file mode 100644 index 0000000..d8947c2 --- /dev/null +++ b/jlm_fin_eval/api/metrics.py @@ -0,0 +1,69 @@ +from typing import Tuple + +import numpy as np +from lm_eval.api.registry import register_aggregation +from lm_eval.api.registry import register_metric +from sklearn.metrics import f1_score + + +@register_aggregation("macro_f1_score") +def macro_f1_score(items: Tuple) -> float | np.ndarray: + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +@register_aggregation("2class_adjusted_macro_f1_score_for_chabsa") +def two_class_adjusted_macro_f1_score_for_chabsa(items: Tuple) -> float | np.ndarray: + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") * 1.5 + return fscore + + +@register_metric( + metric="f1_norm", + higher_is_better=True, + output_type="multiple_choice", +) +def f1_norm_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="map", + higher_is_better=True, + output_type="multiple_choice", +) +def map_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="map_2", + higher_is_better=True, + output_type="multiple_choice", +) +def map_2_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="map_3", + higher_is_better=True, + output_type="multiple_choice", +) +def map_3_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="map_4", + higher_is_better=True, + output_type="multiple_choice", +) +def map_4_fn(items): # This is a passthrough function + return items diff --git a/jlm_fin_eval/evaluator.py b/jlm_fin_eval/evaluator.py deleted file mode 100644 index 46d9598..0000000 --- a/jlm_fin_eval/evaluator.py +++ /dev/null @@ -1,396 +0,0 @@ -import collections -import itertools -import random -from typing import Any -from typing import Dict -from typing import List -from typing import Optional -from typing import Union -from typing import cast - -import lm_eval.base -import lm_eval.metrics -import lm_eval.models -import numpy as np -from lm_eval.base import LM -from lm_eval.base import CachingLM -from lm_eval.base import Task -from lm_eval.utils import positional_deprecated -from lm_eval.utils import run_task_tests - -import jlm_fin_eval.tasks - - -@positional_deprecated -def simple_evaluate( - model: Union[str, LM], - model_args: Optional[str] = None, - tasks: List[Union[str, Task]] = [], - num_fewshot: Union[List[int], int] = 0, - batch_size: Optional[int] = None, - device: Optional[str] = None, - no_cache: bool = False, - limit: Union[int, List[int | None], None] = None, - bootstrap_iters: int = 100000, - description_dict: Optional[Dict[str, str]] = None, - check_integrity: bool = False, - decontamination_ngrams_path: Optional[Dict[str, str]] = None, - verbose: bool = False, -): - """Instantiate and evaluate a model on a list of tasks. - - :param model: Union[str, LM] - Name of model or LM object, see lm_eval.models.get_model - :param model_args: Optional[str] - String arguments for each model class, see LM.create_from_arg_string. - Ignored if `model` argument is a LM object. 
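For reference, the aggregations registered above can be called directly: each takes an iterable of (gold, pred) pairs, unzips them in that order, and returns a macro-F1; the chABSA variant scales it by 1.5, matching the adjusted F1 the removed chabsa.py task computed. The f1_norm/map* functions are deliberate passthroughs, since the per-document values they receive are only reduced later by the aggregation named in the task config. A minimal sketch with invented labels:

    from jlm_fin_eval.api.metrics import two_class_adjusted_macro_f1_score_for_chabsa

    # One (gold, pred) pair per document, in the unpacking order used above.
    items = [
        ("positive", "positive"),
        ("negative", "positive"),
        ("negative", "negative"),
    ]
    score = two_class_adjusted_macro_f1_score_for_chabsa(items)
    print(score)  # macro-F1 is 2/3 for this toy data; scaled by 1.5 -> 1.0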
- :param tasks: list[Union[str, Task]] - List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. - :param num_fewshot: int or list of int - Number of examples in few-shot context - :param batch_size: int, optional - Batch size for model - :param device: str, optional - PyTorch device (e.g. "cpu" or "cuda:0") for running models - :param no_cache: bool - Whether or not to cache - :param limit: int or list of int, optional - Limit the number of examples per task (only use this for testing) - :param bootstrap_iters: - Number of iterations for bootstrap statistics - :param description_dict: dict[str, str] - Dictionary of custom task descriptions of the form: `task_name: description` - :param check_integrity: bool - Whether to run the relevant part of the test suite for the tasks - :return - Dictionary of results - """ - random.seed(1234) - np.random.seed(1234) - - assert tasks != [], "No tasks specified" - - if isinstance(model, str): - if model_args is None: - model_args = "" - lm = lm_eval.models.get_model(model).create_from_arg_string( - model_args, {"batch_size": batch_size, "device": device} - ) - else: - assert isinstance(model, lm_eval.base.LM) - lm = model - - if not no_cache: - if model_args is None: - model_args = "" - lm = lm_eval.base.CachingLM( - lm, - "lm_cache/" - + str(model) - + "_" - + model_args.replace("=", "-").replace(",", "_").replace("/", "-") - + ".db", - ) - - task_dict = jlm_fin_eval.tasks.get_task_dict(tasks) - - if check_integrity: - run_task_tests(task_list=[str(task) for task in tasks]) - - results: Dict[str, Any] = evaluate( - lm=lm, - task_dict=task_dict, - num_fewshot=num_fewshot, - limit=limit, - bootstrap_iters=bootstrap_iters, - description_dict=description_dict, - decontamination_ngrams_path=decontamination_ngrams_path, - verbose=verbose, - ) - - # add info about the model and few shot config - results["config"] = { - "model": model, - "model_args": model_args, - "num_fewshot": num_fewshot, - "batch_size": batch_size, - "device": device, - "no_cache": no_cache, - "limit": limit, - "bootstrap_iters": bootstrap_iters, - "description_dict": description_dict, - } - - return results - - -decontaminate_suffix = "_decontaminate" - - -@positional_deprecated -def evaluate( - lm: Union[LM, CachingLM], - task_dict: Dict[str, Task], - provide_description: Optional[bool] = None, - num_fewshot: Union[List[int], int] = 0, - limit: Union[int, List[int | None], None] = None, - bootstrap_iters: int = 100000, - description_dict: Optional[Dict[str, str]] = None, - decontamination_ngrams_path: Optional[Dict[str, str]] = None, - verbose: bool = False, -): - """Instantiate and evaluate a model on a list of tasks. - - :param lm: obj - Language Model - :param task_dict: dict[str, Task] - Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. 
- :param provide_description: bool - Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method - :param num_fewshot: int or list of int - Number of examples in few-shot context - :param limit: int or list of int, optional - Limit the number of examples per task (only use this for testing) - :param bootstrap_iters: - Number of iterations for bootstrap statistics - :param description_dict: dict[str, str] - Dictionary of custom task descriptions of the form: `task_name: description` - :return - Dictionary of results - """ - # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces - - # TODO: todo: implement proper description-providing system - assert not provide_description # not implemented. - if provide_description is not None: - # nudge people to not specify it at all - print( - "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" - ) - if isinstance(num_fewshot, list): - assert len(task_dict) == len( - num_fewshot - ), f"The number of tasks ({len(task_dict)}) must be same as the number of elements in `num_fewshot` ({len(num_fewshot)})" - else: - # num_fewshot is int - num_fewshot = [num_fewshot] * len(task_dict) - if isinstance(limit, list): - assert len(task_dict) == len( - limit - ), f"The number of tasks ({len(task_dict)}) must be same as the number of elements in `num_fewshot` ({len(limit)})" - else: - # limit is int or None - limit = [limit] * len(task_dict) - - decontaminate = decontamination_ngrams_path is not None - - task_dict_items = [ - (name, task) - for name, task in task_dict.items() - if (task.has_validation_docs() or task.has_test_docs()) - ] - - results = collections.defaultdict(dict) - versions = collections.defaultdict(dict) - - requests = collections.defaultdict(list) - requests_origin = collections.defaultdict(list) - - overlaps = collections.defaultdict(list) # {task_name: contaminated_docs} - - # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger - # memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because - # over-engineering is bad (or we could make it write the requests to disk and then read them back out again - # - probably using an sqlite db because of all the moving parts we have - - # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable - docs = {} - - docs_for_decontamination = collections.defaultdict(list) - - # get lists of each type of request - for idx, (task_name, task) in enumerate(task_dict_items): - versions[task_name] = task.VERSION # type: ignore - # default to test doc, fall back to val doc if validation unavailable - # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point - if task.has_test_docs(): - task_doc_func = task.test_docs - task_set = "test" # Required for caching in the decontamination - elif task.has_validation_docs(): - task_set = "val" # Required for caching in the decontamination - task_doc_func = task.validation_docs - else: - raise RuntimeError("Task has neither test_docs nor validation_docs") - - # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order - task_docs = list(task_doc_func()) - rnd = random.Random() - rnd.seed(42) - rnd.shuffle(task_docs) - - description = ( - 
description_dict[task_name] - if description_dict and task_name in description_dict - else "" - ) - # set tokenizer inside task - if task.LOAD_TOKENIZER: - if isinstance(lm, lm_eval.base.CachingLM): - task.set_tokenizer(lm.lm.tokenizer) - else: - task.set_tokenizer(lm.tokenizer) # type: ignore - # set max_length to task object - task.max_length = ( # type: ignore - lm.lm.max_length - if isinstance(lm, lm_eval.base.CachingLM) - else lm.max_length # type: ignore - ) - task.max_gen_toks = ( # type: ignore - lm.lm.max_gen_toks - if isinstance(lm, lm_eval.base.CachingLM) - else lm.max_gen_toks # type: ignore - ) - - limit_local = limit[idx] - if isinstance(limit_local, float): - limit_local = int(limit_local * len(task_docs)) - print( - f"Use {limit_local}/{len(task_docs)} samples corresponding to the ratio of {limit[idx]}" - ) - for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit_local)): - if decontaminate and task.should_decontaminate(): - docs_for_decontamination[(task_name, task_set)].append( - task.doc_to_decontamination_query(doc) - ) - - docs[(task_name, doc_id)] = doc - ctx = task.fewshot_context( - doc=doc, num_fewshot=num_fewshot[idx], rnd=rnd, description=description - ) - reqs = task.construct_requests(doc, ctx) - if not isinstance(reqs, (list, tuple)): - reqs = [reqs] - for i, req in enumerate(reqs): - requests[req.request_type].append(req) # type: ignore - # i: index in requests for a single task instance - # doc_id: unique id that we can get back to a doc using `docs` - requests_origin[req.request_type].append((i, task_name, doc, doc_id)) # type: ignore - - # Compare all tasks/sets at once to ensure a single training set scan - if decontaminate: - from lm_eval.decontamination.decontaminate import get_train_overlap - - print("Finding train/test overlap, please wait...") - overlaps = get_train_overlap( - docs_for_decontamination, decontamination_ngrams_path, limit - ) - - # all responses for each (task, doc) - process_res_queue = collections.defaultdict(list) - - # execute each type of request - for reqtype, reqs in requests.items(): - # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing - # only in index. We could implement some kind of caching, but that would be more of a band-aid - # solution. we could also implement some kind of auto-grouping here; - # they should end up next to each other. 
- - print("Running", reqtype, "requests") - resps = getattr(lm, reqtype)([req.args for req in reqs]) - resps = [ - x if req.index is None else x[req.index] for x, req in zip(resps, reqs) - ] - - for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]): - process_res_queue[(task_name, doc_id)].append((i, resp)) - - vals = collections.defaultdict(list) - # holds detailed responses for error analysis - details = collections.defaultdict(list) - - # unpack results and sort back in order and return control to Task - for (task_name, doc_id), requests in process_res_queue.items(): - requests.sort(key=lambda x: x[0]) - requests = [x[1] for x in requests] - - task = task_dict[task_name] - doc = docs[(task_name, doc_id)] - - metrics: Dict[str, Any] = cast( - Dict[str, Any], task.process_results(doc, requests) - ) - if "details" in metrics: - details[task_name].append(metrics["details"]) - del metrics["details"] - for metric, value in metrics.items(): - vals[(task_name, metric)].append(value) - - # Re-use the evaluation for the decontaminated set by just ignoring the overlaps - if decontaminate and task_name in overlaps: - if doc_id not in overlaps[task_name]: - vals[(task_name, metric + decontaminate_suffix)].append(value) - - # aggregate results - for (task_name, metric), items in vals.items(): - task: Task = task_dict[task_name] - real_metric = metric # key when looking up the metric with task.aggregation - if metric.endswith(decontaminate_suffix): - real_metric = metric.replace( - decontaminate_suffix, "" - ) # decontaminated still uses the same metric - results[task_name][metric] = cast(Dict, task.aggregation())[real_metric](items) - - # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap - # so we run them less iterations. 
still looking for a cleaner way to do this - - stderr = lm_eval.metrics.stderr_for_metric( - metric=cast(Dict, task.aggregation())[real_metric], - bootstrap_iters=min(bootstrap_iters, 1000) - if metric in ["bleu", "chrf", "ter"] - else bootstrap_iters, - ) - - if stderr is not None: - results[task_name][metric + "_stderr"] = stderr(items) - - if verbose and task_name in details: - results[task_name]["details"] = details[task_name] - - return {"results": dict(results), "versions": dict(versions)} - - -def make_table(result_dict): - """Generate table of results.""" - from pytablewriter import LatexTableWriter - from pytablewriter import MarkdownTableWriter - - md_writer = MarkdownTableWriter() - latex_writer = LatexTableWriter() - md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] - latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] - - values = [] - - for k, dic in result_dict["results"].items(): - version = result_dict["versions"][k] - for m, v in dic.items(): - if m == "details": - continue - - if m.endswith("_stderr"): - continue - - if m + "_stderr" in dic: - se = dic[m + "_stderr"] - values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se]) - else: - values.append([k, version, m, "%.4f" % v, "", ""]) - k = "" - version = "" - md_writer.value_matrix = values - latex_writer.value_matrix = values - - # todo: make latex table look good - # print(latex_writer.dumps()) - - return md_writer.dumps() diff --git a/jlm_fin_eval/tasks/__init__.py b/jlm_fin_eval/tasks/__init__.py index b44525b..c280a56 100644 --- a/jlm_fin_eval/tasks/__init__.py +++ b/jlm_fin_eval/tasks/__init__.py @@ -1,58 +1,33 @@ -from pprint import pprint -from typing import List -from typing import Union - -import lm_eval.base - -from . import chabsa -from . import cma_basics -from . import cpa_audit -from . import fp2 -from . import security_sales_1 - -TASK_REGISTRY = { - **cma_basics.construct_tasks(), - **security_sales_1.construct_tasks(), - **chabsa.construct_tasks(), - **cpa_audit.construct_tasks(), - **fp2.construct_tasks(), -} - -ALL_TASKS = sorted(list(TASK_REGISTRY)) - - -def get_task(task_name): - try: - return TASK_REGISTRY[task_name] - except KeyError: - print("Available tasks:") - pprint(TASK_REGISTRY) - raise KeyError(f"Missing task {task_name}") - - -def get_task_name_from_object(task_object): - for name, class_ in TASK_REGISTRY.items(): - if class_ is task_object: - return name - - # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting - return ( - task_object.EVAL_HARNESS_NAME - if hasattr(task_object, "EVAL_HARNESS_NAME") - else type(task_object).__name__ - ) - - -def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]): - task_name_dict = { - task_name: get_task(task_name)() - for task_name in task_name_list - if isinstance(task_name, str) - } - task_name_from_object_dict = { - get_task_name_from_object(task_object): task_object - for task_object in task_name_list - if not isinstance(task_object, str) - } - assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys())) - return {**task_name_dict, **task_name_from_object_dict} +import os +from typing import Optional + +from lm_eval.tasks import TaskManager as OriginalTaskManager + + +class TaskManager(OriginalTaskManager): + """TaskManager indexes all tasks from the default `lm_eval/tasks/` + and an optional directory if provided. 
+ + """ + + def initialize_tasks(self, include_path: Optional[str] = None) -> dict: + """Creates a dictionary of tasks index. + + :param include_path: str = None + An additional path to be searched for tasks + + :return + Dictionary of task names as key and task metadata + """ + all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"] + if include_path is not None: + if isinstance(include_path, str): + include_path = [include_path] + all_paths.extend(include_path) + + task_index = {} + for task_dir in all_paths: + tasks = self._get_task_and_group(task_dir) + task_index = {**tasks, **task_index} + + return task_index diff --git a/jlm_fin_eval/tasks/chabsa.py b/jlm_fin_eval/tasks/chabsa.py deleted file mode 100644 index e43a070..0000000 --- a/jlm_fin_eval/tasks/chabsa.py +++ /dev/null @@ -1,193 +0,0 @@ -import inspect -import os -from functools import partial - -import numpy as np -from lm_eval.base import MultipleChoiceTask -from lm_eval.base import rf -from sklearn.metrics import accuracy_score -from sklearn.metrics import f1_score - -import jlm_fin_eval.datasets.chabsa.chabsa - - -class Chabsa(MultipleChoiceTask): - VERSION = 1.0 - DATASET_PATH = inspect.getfile(jlm_fin_eval.datasets.chabsa.chabsa) - DATASET_NAME = "chABSA" - DESCRIPTION = "以下のセンテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。\n\n" - - def has_training_docs(self): - return False - - def has_validation_docs(self): - return False - - def has_test_docs(self): - return True - - def training_docs(self): - return None - - def validation_docs(self): - return None - - def test_docs(self): - return self.dataset["test"] - - def doc_to_text(self, doc): - doc_text = f"センテンス: {doc['sentence']}\n" - doc_text += f"ターゲット: {doc['target']}\n" - doc_text += "回答: " - return doc_text - - def doc_to_target(self, doc): - answer = doc["polarity"] - return answer - - @staticmethod - def get_answer(doc): - return doc["polarity"] - - @staticmethod - def compute_scores(gold, pred): - acc = accuracy_score(gold, pred) - - return {"acc": acc} - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, choice)[0] for choice in ["positive", "negative"]] - - return lls - - def process_results(self, doc, results): - gold = doc["polarity"] - pred = ["positive", "negative"][np.array(results).argmax()] - - out = { - "acc": ( - pred, - gold, - ), - "f1": ( - pred, - gold, - ), - } - return out - - def higher_is_better(self): - return {"acc": True, "f1": True} - - def aggregation(self): - return { - "acc": partial(self._chabsa_agg, "acc"), - "f1": partial(self._chabsa_agg, "f1"), - } - - def _chabsa_agg(self, key, item): - predictions, references = zip(*item) - if key == "acc": - return (np.asarray(predictions) == np.asarray(references)).mean() - elif key == "f1": - return f1_score(references, predictions, average="macro") * 1.5 - else: - raise KeyError(key) - - -class ChabsaWithAnlpPrompt(Chabsa): - PROMPT_VERSION = 0.1 - DESCRIPTION = "[センテンス]における、[ターゲット]のセンチメントをpositiveかnegativeで選んでください。\n\n" - - def doc_to_text(self, doc): - doc_text = f"[センテンス]: {doc['sentence']}\n" - doc_text += f"[ターゲット]: {doc['target']}\n" - doc_text += "[答え]:" - return doc_text - - -class ChabsaWithFintanPrompt(Chabsa): - PROMPT_VERSION = 0.2 - DESCRIPTION = ( - "センテンスとターゲットを入力として受け取り、ターゲットに関するセンチメントをpositiveかnegativeから選択してください。\n\n" - ) - - def doc_to_text(self, doc): - doc_text = f"センテンス:{doc['sentence']}\n" - doc_text += f"ターゲット:{doc['target']}\n" - doc_text += "回答:" - return doc_text - - -class ChabsaWithAlpacaPrompt(Chabsa): - PROMPT_VERSION = 0.3 - DESCRIPTION 
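A sketch of how this TaskManager is meant to be used, assuming lm-eval's 0.4-style entry point in which simple_evaluate accepts a task_manager; the model backend, checkpoint, and task list below are placeholders:

    import lm_eval

    from jlm_fin_eval.tasks import TaskManager

    # Indexes the YAML task configs shipped under jlm_fin_eval/tasks/
    # (plus any extra directory passed as include_path).
    task_manager = TaskManager()

    results = lm_eval.simple_evaluate(
        model="hf",                      # placeholder backend
        model_args="pretrained=gpt2",    # placeholder checkpoint
        tasks=["chabsa", "cma_basics"],  # task names defined by the YAML files in this change
        num_fewshot=0,
        task_manager=task_manager,
    )
    print(results["results"])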
= """以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。 - -### 指示: -以下のセンテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。 - -""" - - def doc_to_text(self, doc): - doc_text = f"""### 入力: -センテンス:{doc['sentence']} -ターゲット:{doc['target']} - -### 応答: -""" - return doc_text - - -class ChabsaWithRinnaInstructionSFT(Chabsa): - PROMPT_VERSION = 0.4 - DESCRIPTION = ( - "ユーザー: センテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。システム: 分かりました。" - ) - SEP = "" - FEWSHOT_SEP = "" - - def doc_to_text(self, doc): - doc_text = ( - f"センテンス: {doc['sentence']}{self.SEP}ターゲット: {doc['target']}{self.SEP}システム: " - ) - return doc_text - - -class ChabsaWithRinnaBilingualInstructionSFT(ChabsaWithRinnaInstructionSFT): - PROMPT_VERSION = 0.5 - DESCRIPTION = "ユーザー: 与えられた文脈から、質問に対する答えを抜き出してください。\nシステム: 分かりました。\n" - SEP = "\n" - FEWSHOT_SEP = "\n" - - -class ChabsaWithLlama2(Chabsa): - PROMPT_VERSION = 0.6 - DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" - SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", DEFAULT_SYSTEM_PROMPT) - DESCRIPTION = f"[INST] <>\n{SYSTEM_PROMPT}\n<>\n\n" - INSTRUCTION = "与えられたセンテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。" - FEWSHOT_SEP = " [INST] " - - def doc_to_text(self, doc): - input_text = f"センテンス: {doc['sentence'].split('[SEP]')[-1].strip()}\nターゲット: {doc['target']}" - return f"{self.INSTRUCTION}\n\n{input_text} [/INST] " - - -VERSIONS = [ - ChabsaWithAnlpPrompt, - ChabsaWithFintanPrompt, - ChabsaWithAlpacaPrompt, - ChabsaWithRinnaInstructionSFT, - ChabsaWithRinnaBilingualInstructionSFT, - ChabsaWithLlama2, -] - - -def construct_tasks(): - tasks = {} - for version_class in VERSIONS: - tasks[ - f"chabsa-{version_class.VERSION}-{version_class.PROMPT_VERSION}" - ] = version_class - tasks["chabsa"] = Chabsa - return tasks diff --git a/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.1.yaml b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.1.yaml new file mode 100644 index 0000000..9976033 --- /dev/null +++ b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.1.yaml @@ -0,0 +1,4 @@ +include: chabsa.yaml +task: chabsa-1.0-0.1 +description: "[センテンス]における、[ターゲット]のセンチメントをpositiveかnegativeで選んでください。\n\n\n" +doc_to_text: "[センテンス]: {{sentence}}\nターゲット]: {{target}}\n[答え]:" diff --git a/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.2.yaml b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.2.yaml new file mode 100644 index 0000000..6fc446e --- /dev/null +++ b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.2.yaml @@ -0,0 +1,4 @@ +include: chabsa.yaml +task: chabsa-1.0-0.2 +description: "センテンスとターゲットを入力として受け取り、ターゲットに関するセンチメントをpositiveかnegativeから選択してください。\n\n\n" +doc_to_text: "センテンス: {{sentence}}\nターゲット: {{target}}\n回答:" diff --git a/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.3.yaml b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.3.yaml new file mode 100644 index 0000000..2a85dad --- /dev/null +++ b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.3.yaml @@ -0,0 +1,5 @@ +include: chabsa.yaml +task: chabsa-1.0-0.3 +description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n以下のセンテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。\n\n\n" +doc_to_text: "### 
入力:\nセンテンス:{{sentence}}\nターゲット:{{target}}\n\n### 応答:\n\n" +target_delimiter: "" diff --git a/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.4.yaml b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.4.yaml new file mode 100644 index 0000000..05b1249 --- /dev/null +++ b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.4.yaml @@ -0,0 +1,5 @@ +include: chabsa.yaml +task: chabsa-1.0-0.4 +description: "ユーザー: センテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。システム: 分かりました。" +fewshot_delimiter: "" +doc_to_text: "センテンス: {{sentence}}ターゲット: {{target}}システム:" diff --git a/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.5.yaml b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.5.yaml new file mode 100644 index 0000000..e6e680c --- /dev/null +++ b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.5.yaml @@ -0,0 +1,5 @@ +include: chabsa.yaml +task: chabsa-1.0-0.5 +description: "ユーザー: センテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。\nシステム: 分かりました。\n\n" +fewshot_delimiter: "\n" +doc_to_text: "センテンス: {{sentence}}\nターゲット: {{target}}\nシステム:" diff --git a/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.6.yaml b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.6.yaml new file mode 100644 index 0000000..a1fca1d --- /dev/null +++ b/jlm_fin_eval/tasks/chabsa/chabsa-1.0-0.6.yaml @@ -0,0 +1,5 @@ +include: chabsa.yaml +task: chabsa-1.0-0.6 +description: "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<>\n\n\n" +fewshot_delimiter: " [INST] " +doc_to_text: "与えられたセンテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。\n\nセンテンス: {{sentence}}\nターゲット: {{target}} [/INST]" diff --git a/jlm_fin_eval/tasks/chabsa/chabsa.yaml b/jlm_fin_eval/tasks/chabsa/chabsa.yaml new file mode 100644 index 0000000..5866284 --- /dev/null +++ b/jlm_fin_eval/tasks/chabsa/chabsa.yaml @@ -0,0 +1,32 @@ +task: chabsa +dataset_path: jlm_fin_eval/datasets/chabsa/chabsa.py +dataset_name: chABSA +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +description: "以下のセンテンスにおける、ターゲットのセンチメントをpositiveかnegativeで答えてください。\n\n\n" +fewshot_delimiter: "\n\n" +doc_to_text: "センテンス: {{sentence}}\nターゲット: {{target}}\n回答:" +doc_to_target: "{{polarity}}" +doc_to_choice: + - "positive" + # - "neutral" + - "negative" +should_decontaminate: false +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: 2class_adjusted_macro_f1_score_for_chabsa + higher_is_better: true + - metric: f1_norm + aggregation: 2class_adjusted_macro_f1_score_for_chabsa + higher_is_better: true +metadata: + - version: 1.0 diff --git a/jlm_fin_eval/tasks/cma_basics.py b/jlm_fin_eval/tasks/cma_basics.py deleted file mode 100644 index f87bab1..0000000 --- a/jlm_fin_eval/tasks/cma_basics.py +++ /dev/null @@ -1,254 +0,0 @@ -import inspect -import os - -import numpy as np -from lm_eval.base import MultipleChoiceTask -from lm_eval.base import mean -from lm_eval.base import rf -from sklearn.metrics import accuracy_score - -import jlm_fin_eval.datasets.cma_basics.cma_basics - - -class CmaBasics(MultipleChoiceTask): - VERSION = 1.0 - 
DATASET_PATH = inspect.getfile(jlm_fin_eval.datasets.cma_basics.cma_basics) - DATASET_NAME = "cma_basics" - DESCRIPTION = "以下の問題の適切な答えを選択肢から選んでアルファベットで答えなさい。\n\n" - - def has_training_docs(self): - return False - - def has_validation_docs(self): - return False - - def has_test_docs(self): - return True - - def training_docs(self): - return None - - def validation_docs(self): - return None - - def test_docs(self): - return self.dataset["test"] - - def doc_to_text(self, doc): - doc_text = "【問題】\n" + doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - doc_text += doc["context"] + "\n" - doc_text += "\n【選択肢】\n" - for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): - doc_text += chr(choice_id + 65) + ": " + choice_text + "\n" - doc_text += "\n【答え】\n" - # doc_text += chr(int(doc["answer"]) + 65) - return doc_text - - def doc_to_target(self, doc): - answer = chr(int(doc["answer"]) + 65) - return answer - - @staticmethod - def compute_scores(gold, pred): - acc = accuracy_score(gold, pred) - - return {"acc": acc} - - def construct_requests(self, doc, ctx): - lls = [ - rf.loglikelihood(ctx, chr(choice + 65))[0] - for choice in doc["choices"]["id"] - ] - - return lls - - def process_results(self, doc, results): - gold = doc["answer"] - - acc = 1.0 if doc["choices"]["id"][np.argmax(results)] == gold else 0.0 - ranking = [doc["choices"]["id"][i] for i in np.argsort(results)[::-1]] - correct_answer_ranking = ranking.index(gold) + 1 - map_score = 1.0 / correct_answer_ranking - map_2 = 0.0 if correct_answer_ranking > 2 else map_score - map_3 = 0.0 if correct_answer_ranking > 3 else map_score - map_4 = 0.0 if correct_answer_ranking > 4 else map_score - - return { - "acc": acc, - "map": map_score, - "map_2": map_2, - "map_3": map_3, - "map_4": map_4, - } - - def higher_is_better(self): - return {"acc": True, "map": True, "map_2": True, "map_3": True, "map_4": True} - - def aggregation(self): - return { - "acc": mean, - "map": mean, - "map_2": mean, - "map_3": mean, - "map_4": mean, - } - - -class CmaBasicsWithAnlpPrompt(CmaBasics): - PROMPT_VERSION = 0.1 - DESCRIPTION = "[問題]に対する[答え]を[選択肢]の中から選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" - - def doc_to_target(self, doc): - return [ - choice_text - for choice_id, choice_text in zip( - doc["choices"]["id"], doc["choices"]["text"] - ) - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, choice)[0] for choice in doc["choices"]["text"]] - - return lls - - -class CmaBasicsWithAnlpPromptAlphabet(CmaBasics): - PROMPT_VERSION = "0.1.2" - DESCRIPTION = "[問題]に対する[答え]を[選択肢]の中からアルファベットで選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choice_doc_text = [] - for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): - choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" - - -class CmaBasicsWithFintanPrompt(CmaBasics): - PROMPT_VERSION = 0.2 - DESCRIPTION = "質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - 
q_doc_text += doc["context"] + "\n" - choices = ",".join( - [ - f"{idx}.{choice}" - for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) - ] - ) - return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" - - def doc_to_target(self, doc): - return [ - str(choice_id) - for choice_id in doc["choices"]["id"] - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, str(choice))[0] for choice in doc["choices"]["id"]] - - return lls - - -class CmaBasicsWithFintanPromptV1(CmaBasicsWithAnlpPrompt): - PROMPT_VERSION = "0.2.1" - DESCRIPTION = "与えられた選択肢の中から、最適な答えを選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - return f"質問:{q_doc_text}選択肢:\n{choices}\n回答:" - - -class CmaBasicsWithAlpacaPrompt(CmaBasicsWithAnlpPrompt): - PROMPT_VERSION = 0.3 - DESCRIPTION = """以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。 - -### 指示: -与えられた選択肢の中から、最適な答えを選んでください。 - -""" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" - return f"### 入力:\n{input_text}\n\n### 応答:\n" - - -class CmaBasicsWithRinnaInstructionSFT(CmaBasicsWithAnlpPrompt): - PROMPT_VERSION = 0.4 - DESCRIPTION = "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。システム: 分かりました。" - SEP = "" - FEWSHOT_SEP = "" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] - if doc["context"] and doc["context"] != "": - q_doc_text += "\n" + doc["context"] - choices = self.SEP.join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"質問:{q_doc_text}{self.SEP}" + f"選択肢:{self.SEP}{choices}" - return f"ユーザー: {input_text}{self.SEP}システム: " - - -class CmaBasicsWithRinnaBilingualInstructionSFT(CmaBasicsWithRinnaInstructionSFT): - PROMPT_VERSION = 0.5 - DESCRIPTION = "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。\nシステム: 分かりました。\n" - SEP = "\n" - FEWSHOT_SEP = "\n" - - -class CmaBasicsWithLlama2(CmaBasicsWithAnlpPrompt): - PROMPT_VERSION = 0.6 - DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" - SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", DEFAULT_SYSTEM_PROMPT) - DESCRIPTION = f"[INST] <>\n{SYSTEM_PROMPT}\n<>\n\n" - INSTRUCTION = "与えられた選択肢の中から、最適な答えを選んでください。" - FEWSHOT_SEP = " [INST] " - - def doc_to_text(self, doc): - q_doc_text = doc["question"] - if doc["context"] and doc["context"] != "": - q_doc_text += "\n" + doc["context"] - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"質問:{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" - return f"{self.INSTRUCTION}\n\n{input_text} [/INST] " - - -VERSIONS = [ - CmaBasicsWithAnlpPrompt, - CmaBasicsWithAnlpPromptAlphabet, - CmaBasicsWithFintanPrompt, - CmaBasicsWithFintanPromptV1, - CmaBasicsWithAlpacaPrompt, - CmaBasicsWithRinnaInstructionSFT, - CmaBasicsWithRinnaBilingualInstructionSFT, - CmaBasicsWithLlama2, -] - - -def construct_tasks(): - tasks = {} - for version_class in VERSIONS: - tasks[ - f"cma_basics-{version_class.VERSION}-{version_class.PROMPT_VERSION}" - ] = version_class - tasks["cma_basics"] = CmaBasics - return tasks diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.1.2.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.1.2.yaml new file mode 100644 index 0000000..c120728 --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.1.2.yaml @@ -0,0 +1,4 @@ +include: cma_basics.yaml +task: cma_basics-1.0-0.1.2 +description: "[問題]に対する[答え]を[選択肢]の中からアルファベットで選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01_2 diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.1.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.1.yaml new file mode 100644 index 0000000..42955ee --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.1.yaml @@ -0,0 +1,6 @@ +include: cma_basics.yaml +task: cma_basics-1.0-0.1 +description: "[問題]に対する[答え]を[選択肢]の中から選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01 +doc_to_target: !function utils.doc_to_target_01 +doc_to_choice: !function utils.doc_to_choices_01 diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.2.1.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.2.1.yaml new file mode 100644 index 0000000..dd2676b --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.2.1.yaml @@ -0,0 +1,4 @@ +include: cma_basics-1.0-0.1.yaml +task: cma_basics-1.0-0.2.1 +description: "与えられた選択肢の中から、最適な答えを選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_02_1 diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.2.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.2.yaml new file mode 100644 index 0000000..10c49a1 --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.2.yaml @@ -0,0 +1,6 @@ +include: cma_basics.yaml +task: cma_basics-1.0-0.2 +description: "質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n\n" +doc_to_text: !function utils.doc_to_text_02 +doc_to_target: !function utils.doc_to_target_02 +doc_to_choice: !function utils.doc_to_choices_02 diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.3.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.3.yaml new file mode 100644 index 0000000..23319f2 --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.3.yaml @@ -0,0 +1,4 @@ +include: cma_basics-1.0-0.1.yaml +task: cma_basics-1.0-0.3 +description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n与えられた選択肢の中から、最適な答えを選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_03 diff --git 
a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.4.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.4.yaml new file mode 100644 index 0000000..3ce1ae1 --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.4.yaml @@ -0,0 +1,5 @@ +include: cma_basics-1.0-0.1.yaml +task: cma_basics-1.0-0.4 +description: "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。システム: 分かりました。" +fewshot_delimiter: "" +doc_to_text: !function utils.doc_to_text_04 diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.5.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.5.yaml new file mode 100644 index 0000000..1c4d6c8 --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.5.yaml @@ -0,0 +1,5 @@ +include: cma_basics-1.0-0.4.yaml +task: cma_basics-1.0-0.5 +description: "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。\nシステム: 分かりました。\n\n" +fewshot_delimiter: "\n" +doc_to_text: !function utils.doc_to_text_05 diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.6.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.6.yaml new file mode 100644 index 0000000..fcb9e71 --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics-1.0-0.6.yaml @@ -0,0 +1,5 @@ +include: cma_basics-1.0-0.1.yaml +task: cma_basics-1.0-0.6 +description: "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<>\n\n\n" +fewshot_delimiter: " [INST] " +doc_to_text: !function utils.doc_to_text_06 diff --git a/jlm_fin_eval/tasks/cma_basics/cma_basics.yaml b/jlm_fin_eval/tasks/cma_basics/cma_basics.yaml new file mode 100644 index 0000000..630f40e --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/cma_basics.yaml @@ -0,0 +1,48 @@ +task: cma_basics +dataset_path: jlm_fin_eval/datasets/cma_basics/cma_basics.py +dataset_name: cma_basics +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +description: "以下の問題の適切な答えを選択肢から選んでアルファベットで答えなさい。\n\n\n" +target_delimiter: "" +fewshot_delimiter: "\n\n" +doc_to_text: !function utils.doc_to_text_00 +doc_to_target: !function utils.doc_to_target_alphabet +doc_to_choice: !function utils.doc_to_choices_alphabet +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: map + aggregation: mean + higher_is_better: true + - metric: map_2 + aggregation: mean + higher_is_better: true + - metric: map_3 + aggregation: mean + higher_is_better: true + - metric: map_4 + aggregation: mean + higher_is_better: true + - metric: map_norm + aggregation: mean + higher_is_better: true + - metric: map_2_norm + aggregation: mean + higher_is_better: true + - metric: map_3_norm + aggregation: mean + higher_is_better: true + - metric: map_4_norm + aggregation: mean + higher_is_better: true +metadata: + - version: 1.0 diff --git a/jlm_fin_eval/tasks/cma_basics/utils.py b/jlm_fin_eval/tasks/cma_basics/utils.py new file mode 100644 index 0000000..9c45789 --- /dev/null +++ b/jlm_fin_eval/tasks/cma_basics/utils.py @@ -0,0 +1,115 @@ +def doc_to_text_00(doc): + doc_text = "【問題】\n" + 
doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + doc_text += doc["context"] + "\n" + doc_text += "\n【選択肢】\n" + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): + doc_text += chr(choice_id + 65) + ": " + choice_text + "\n" + doc_text += "\n【答え】\n" + # doc_text += chr(int(doc["answer"]) + 65) + return doc_text + + +def doc_to_target_alphabet(doc): + answer = chr(int(doc["answer"]) + 65) + return answer + + +def doc_to_choices_alphabet(doc): + choices = [chr(choice_id + 65) for choice_id in doc["choices"]["id"]] + return choices + + +def doc_to_text_01(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" + + +def doc_to_target_01(doc): + return [ + choice_text + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]) + if choice_id == doc["answer"] + ][0] + + +def doc_to_choices_01(doc): + choices = doc["choices"]["text"] + return choices + + +def doc_to_text_01_2(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choice_doc_text = [] + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): + choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" + + +def doc_to_text_02(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = ",".join( + [ + f"{idx}.{choice}" + for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) + ] + ) + return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" + + +def doc_to_target_02(doc): + return [ + str(choice_id) + for choice_id in doc["choices"]["id"] + if choice_id == doc["answer"] + ][0] + + +def doc_to_choices_02(doc): + return [str(choice) for choice in doc["choices"]["id"]] + + +def doc_to_text_02_1(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + return f"質問:{q_doc_text}選択肢:\n{choices}\n回答:" + + +def doc_to_text_03(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" + return f"### 入力:\n{input_text}\n\n### 応答:\n" + + +def doc_to_text_04(doc, SEP=""): + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + choices = SEP.join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"質問:{q_doc_text}{SEP}" + f"選択肢:{SEP}{choices}" + return f"ユーザー: {input_text}{SEP}システム: " + + +def doc_to_text_05(doc): + return doc_to_text_04(doc, SEP="\n") + + +def doc_to_text_06(doc): + INSTRUCTION = "与えられた選択肢の中から、最適な答えを選んでください。" + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"質問:{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" + return f"{INSTRUCTION}\n\n{input_text} [/INST] " diff --git a/jlm_fin_eval/tasks/cpa_audit.py b/jlm_fin_eval/tasks/cpa_audit.py deleted file mode 100644 index 
b1183a3..0000000 --- a/jlm_fin_eval/tasks/cpa_audit.py +++ /dev/null @@ -1,259 +0,0 @@ -import inspect -import os - -import numpy as np -from lm_eval.base import MultipleChoiceTask -from lm_eval.base import mean -from lm_eval.base import rf -from sklearn.metrics import accuracy_score - -import jlm_fin_eval.datasets.cpa.cpa_audit - - -class CpaAudit(MultipleChoiceTask): - VERSION = 1.0 - DATASET_PATH = inspect.getfile(jlm_fin_eval.datasets.cpa.cpa_audit) - DATASET_NAME = "cpa_audit" - DESCRIPTION = "以下の問題の答えとして適切な記号の組み合わせを選択肢から選んでアルファベットで答えなさい。\n\n" - - def has_training_docs(self): - return False - - def has_validation_docs(self): - return False - - def has_test_docs(self): - return True - - def training_docs(self): - return None - - def validation_docs(self): - return None - - def test_docs(self): - return self.dataset["test"] - - def doc_to_text(self, doc): - doc_text = "【問題】\n" + doc["question"] + "\n" - - if doc["context"] and doc["context"] != "": - doc_text += doc["context"] + "\n" - doc_text += "\n【選択肢】\n" - for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): - doc_text += chr(choice_id + 65) + ": " + choice_text + "\n" - doc_text += "\n【答え】\n" - # doc_text += chr(int(doc["answer"]) + 65) - return doc_text - - def doc_to_target(self, doc): - answer = chr(int(doc["answer"]) + 65) - return answer - - @staticmethod - def get_answer(doc): - return chr(int(doc["answer"]) + 65) - - @staticmethod - def compute_scores(gold, pred): - acc = accuracy_score(gold, pred) - - return {"acc": acc} - - def construct_requests(self, doc, ctx): - lls = [ - rf.loglikelihood(ctx, "{}".format(chr(choice + 65)))[0] - for choice in doc["choices"]["id"] - ] - - return lls - - def process_results(self, doc, results): - gold = doc["answer"] - - acc = 1.0 if doc["choices"]["id"][np.argmax(results)] == gold else 0.0 - ranking = [doc["choices"]["id"][i] for i in np.argsort(results)[::-1]] - correct_answer_ranking = ranking.index(gold) + 1 - map_score = 1.0 / correct_answer_ranking - map_2 = 0.0 if correct_answer_ranking > 2 else map_score - map_3 = 0.0 if correct_answer_ranking > 3 else map_score - map_4 = 0.0 if correct_answer_ranking > 4 else map_score - - return { - "acc": acc, - "map": map_score, - "map_2": map_2, - "map_3": map_3, - "map_4": map_4, - } - - def higher_is_better(self): - return {"acc": True, "map": True, "map_2": True, "map_3": True, "map_4": True} - - def aggregation(self): - return { - "acc": mean, - "map": mean, - "map_2": mean, - "map_3": mean, - "map_4": mean, - } - - -class CpaAuditWithAnlpPrompt(CpaAudit): - PROMPT_VERSION = 0.1 - DESCRIPTION = "[問題]に対する[答え]をとして適切な記号の組み合わせを[選択肢]の中から選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" - - def doc_to_target(self, doc): - return [ - choice_text - for choice_id, choice_text in zip( - doc["choices"]["id"], doc["choices"]["text"] - ) - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, choice)[0] for choice in doc["choices"]["text"]] - - return lls - - -class CpaAuditWithAnlpPromptAlphabet(CpaAudit): - PROMPT_VERSION = "0.1.2" - DESCRIPTION = "[問題]に対する[答え]として適切な記号の組み合わせを[選択肢]の中からアルファベットで選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - 
choice_doc_text = [] - for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): - choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" - - -class CpaAuditWithFintanPrompt(CpaAudit): - PROMPT_VERSION = 0.2 - DESCRIPTION = "質問とその答えとして適切な記号の組み合わせの選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = ",".join( - [ - f"{idx}.{choice}" - for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) - ] - ) - return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" - - def doc_to_target(self, doc): - return [ - str(choice_id) - for choice_id in doc["choices"]["id"] - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, str(choice))[0] for choice in doc["choices"]["id"]] - - return lls - - -class CpaAuditWithFintanPromptV1(CpaAuditWithAnlpPrompt): - PROMPT_VERSION = "0.2.1" - DESCRIPTION = "与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - return f"質問:{q_doc_text}選択肢:\n{choices}\n回答:" - - -class CpaAuditWithAlpacaPrompt(CpaAuditWithAnlpPrompt): - PROMPT_VERSION = 0.3 - DESCRIPTION = """以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。 - -### 指示: -与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。 - -""" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" - return f"### 入力:\n{input_text}\n\n### 応答:\n" - - -class CpaAuditWithRinnaInstructionSFT(CpaAuditWithAnlpPrompt): - PROMPT_VERSION = 0.4 - DESCRIPTION = "ユーザー: 与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。システム: 分かりました。" - SEP = "" - FEWSHOT_SEP = "" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] - if doc["context"] and doc["context"] != "": - q_doc_text += "\n" + doc["context"] - choices = self.SEP.join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"質問:{q_doc_text}{self.SEP}" + f"選択肢:{self.SEP}{choices}" - return f"ユーザー: {input_text}{self.SEP}システム: " - - -class CpaAuditWithRinnaBilingualInstructionSFT(CpaAuditWithRinnaInstructionSFT): - PROMPT_VERSION = 0.5 - DESCRIPTION = "ユーザー: 与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。\nシステム: 分かりました。\n" - SEP = "\n" - FEWSHOT_SEP = "\n" - - -class CpaAuditWithLlama2(CpaAuditWithAnlpPrompt): - PROMPT_VERSION = 0.6 - DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" - SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", DEFAULT_SYSTEM_PROMPT) - DESCRIPTION = f"[INST] <>\n{SYSTEM_PROMPT}\n<>\n\n" - INSTRUCTION = "与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。" - FEWSHOT_SEP = " [INST] " - - def doc_to_text(self, doc): - q_doc_text = doc["question"] - if doc["context"] and doc["context"] != "": - q_doc_text += "\n" + doc["context"] - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"質問:{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" - return f"{self.INSTRUCTION}\n\n{input_text} [/INST] " - - -VERSIONS = [ - CpaAuditWithAnlpPrompt, - CpaAuditWithAnlpPromptAlphabet, - CpaAuditWithFintanPrompt, - CpaAuditWithFintanPromptV1, - CpaAuditWithAlpacaPrompt, - CpaAuditWithRinnaInstructionSFT, - CpaAuditWithRinnaBilingualInstructionSFT, - CpaAuditWithLlama2, -] - - -def construct_tasks(): - tasks = {} - for version_class in VERSIONS: - tasks[ - f"cpa_audit-{version_class.VERSION}-{version_class.PROMPT_VERSION}" - ] = version_class - tasks["cpa_audit"] = CpaAudit - return tasks diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.1.2.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.1.2.yaml new file mode 100644 index 0000000..22eb2f4 --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.1.2.yaml @@ -0,0 +1,4 @@ +include: cpa_audit.yaml +task: cpa_audit-1.0-0.1.2 +description: "[問題]に対する[答え]として適切な記号の組み合わせを[選択肢]の中からアルファベットで選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01_2 diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.1.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.1.yaml new file mode 100644 index 0000000..c0c57af --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.1.yaml @@ -0,0 +1,6 @@ +include: cpa_audit.yaml +task: cpa_audit-1.0-0.1 +description: "[問題]に対する[答え]をとして適切な記号の組み合わせを[選択肢]の中から選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01 +doc_to_target: !function utils.doc_to_target_01 +doc_to_choice: !function utils.doc_to_choices_01 diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.2.1.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.2.1.yaml new file mode 100644 index 0000000..ccac2c6 --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.2.1.yaml @@ -0,0 +1,4 @@ +include: cpa_audit-1.0-0.1.yaml +task: cpa_audit-1.0-0.2.1 +description: "与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_02_1 diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.2.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.2.yaml new file mode 100644 index 0000000..c52cfc0 --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.2.yaml @@ -0,0 +1,6 @@ +include: cpa_audit.yaml +task: cpa_audit-1.0-0.2 +description: "質問とその答えとして適切な記号の組み合わせの選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n\n" +doc_to_text: !function utils.doc_to_text_02 +doc_to_target: !function utils.doc_to_target_02 +doc_to_choice: !function utils.doc_to_choices_02 diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.3.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.3.yaml new file mode 100644 index 0000000..53ad700 --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.3.yaml @@ -0,0 +1,4 @@ +include: cpa_audit-1.0-0.1.yaml +task: cpa_audit-1.0-0.3 +description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_03 diff --git 
a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.4.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.4.yaml new file mode 100644 index 0000000..31da04f --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.4.yaml @@ -0,0 +1,5 @@ +include: cpa_audit-1.0-0.1.yaml +task: cpa_audit-1.0-0.4 +description: "ユーザー: 与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。システム: 分かりました。" +fewshot_delimiter: "" +doc_to_text: !function utils.doc_to_text_04 diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.5.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.5.yaml new file mode 100644 index 0000000..d037b54 --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.5.yaml @@ -0,0 +1,5 @@ +include: cpa_audit-1.0-0.4.yaml +task: cpa_audit-1.0-0.5 +description: "ユーザー: 与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。\nシステム: 分かりました。\n\n" +fewshot_delimiter: "\n" +doc_to_text: !function utils.doc_to_text_05 diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.6.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.6.yaml new file mode 100644 index 0000000..361836d --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit-1.0-0.6.yaml @@ -0,0 +1,5 @@ +include: cpa_audit-1.0-0.1.yaml +task: cpa_audit-1.0-0.6 +description: "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<>\n\n\n" +fewshot_delimiter: " [INST] " +doc_to_text: !function utils.doc_to_text_06 diff --git a/jlm_fin_eval/tasks/cpa_audit/cpa_audit.yaml b/jlm_fin_eval/tasks/cpa_audit/cpa_audit.yaml new file mode 100644 index 0000000..84485c7 --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/cpa_audit.yaml @@ -0,0 +1,48 @@ +task: cpa_audit +dataset_path: jlm_fin_eval/datasets/cpa/cpa_audit.py +dataset_name: cpa_audit +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +description: "以下の問題の答えとして適切な記号の組み合わせを選択肢から選んでアルファベットで答えなさい。\n\n\n" +target_delimiter: "" +fewshot_delimiter: "\n\n" +doc_to_text: !function utils.doc_to_text_00 +doc_to_target: !function utils.doc_to_target_alphabet +doc_to_choice: !function utils.doc_to_choices_alphabet +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: map + aggregation: mean + higher_is_better: true + - metric: map_2 + aggregation: mean + higher_is_better: true + - metric: map_3 + aggregation: mean + higher_is_better: true + - metric: map_4 + aggregation: mean + higher_is_better: true + - metric: map_norm + aggregation: mean + higher_is_better: true + - metric: map_2_norm + aggregation: mean + higher_is_better: true + - metric: map_3_norm + aggregation: mean + higher_is_better: true + - metric: map_4_norm + aggregation: mean + higher_is_better: true +metadata: + - version: 1.0 diff --git a/jlm_fin_eval/tasks/cpa_audit/utils.py b/jlm_fin_eval/tasks/cpa_audit/utils.py new file mode 100644 index 0000000..4cabe66 --- /dev/null +++ b/jlm_fin_eval/tasks/cpa_audit/utils.py @@ -0,0 +1,115 @@ +def doc_to_text_00(doc): + doc_text = "【問題】\n" + doc["question"] 
+ "\n" + if doc["context"] and doc["context"] != "": + doc_text += doc["context"] + "\n" + doc_text += "\n【選択肢】\n" + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): + doc_text += chr(choice_id + 65) + ": " + choice_text + "\n" + doc_text += "\n【答え】\n" + # doc_text += chr(int(doc["answer"]) + 65) + return doc_text + + +def doc_to_target_alphabet(doc): + answer = chr(int(doc["answer"]) + 65) + return answer + + +def doc_to_choices_alphabet(doc): + choices = [chr(choice_id + 65) for choice_id in doc["choices"]["id"]] + return choices + + +def doc_to_text_01(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" + + +def doc_to_target_01(doc): + return [ + choice_text + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]) + if choice_id == doc["answer"] + ][0] + + +def doc_to_choices_01(doc): + choices = doc["choices"]["text"] + return choices + + +def doc_to_text_01_2(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choice_doc_text = [] + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): + choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" + + +def doc_to_text_02(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = ",".join( + [ + f"{idx}.{choice}" + for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) + ] + ) + return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" + + +def doc_to_target_02(doc): + return [ + str(choice_id) + for choice_id in doc["choices"]["id"] + if choice_id == doc["answer"] + ][0] + + +def doc_to_choices_02(doc): + return [str(choice) for choice in doc["choices"]["id"]] + + +def doc_to_text_02_1(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + return f"質問:{q_doc_text}選択肢:\n{choices}\n回答:" + + +def doc_to_text_03(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" + return f"### 入力:\n{input_text}\n\n### 応答:\n" + + +def doc_to_text_04(doc, SEP=""): + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + choices = SEP.join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"質問:{q_doc_text}{SEP}" + f"選択肢:{SEP}{choices}" + return f"ユーザー: {input_text}{SEP}システム: " + + +def doc_to_text_05(doc): + return doc_to_text_04(doc, SEP="\n") + + +def doc_to_text_06(doc): + INSTRUCTION = "与えられた答えの組み合わせの選択肢の中から、最適な選択肢を選んでください。" + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"質問:{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" + return f"{INSTRUCTION}\n\n{input_text} [/INST] " diff --git a/jlm_fin_eval/tasks/fp2.py b/jlm_fin_eval/tasks/fp2.py deleted file mode 100644 index fb9bdf2..0000000 
--- a/jlm_fin_eval/tasks/fp2.py +++ /dev/null @@ -1,256 +0,0 @@ -import inspect -import os - -import numpy as np -from lm_eval.base import MultipleChoiceTask -from lm_eval.base import mean -from lm_eval.base import rf -from sklearn.metrics import accuracy_score - -import jlm_fin_eval.datasets.fp2.fp2 - - -class FP2(MultipleChoiceTask): - VERSION = 1.0 - DATASET_PATH = inspect.getfile(jlm_fin_eval.datasets.fp2.fp2) - DATASET_NAME = "fp2" - DESCRIPTION = "以下の問題の適切な答えを選択肢から選んで1~4の数字で答えなさい。\n\n" - - def has_training_docs(self): - return False - - def has_validation_docs(self): - return False - - def has_test_docs(self): - return True - - def training_docs(self): - return None - - def validation_docs(self): - return None - - def test_docs(self): - return self.dataset["test"] - - def doc_to_text(self, doc): - doc_text = "【問題】\n" + doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - doc_text += doc["context"] + "\n" - doc_text += "\n【答え】\n" - # doc_text += chr(int(doc["answer"]) + 65) - return doc_text - - def doc_to_target(self, doc): - answer = doc["choices"]["text"][doc["choices"]["id"].index(doc["answer"])] - return answer - - @staticmethod - def compute_scores(gold, pred): - acc = accuracy_score(gold, pred) - - return {"acc": acc} - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, choice)[0] for choice in doc["choices"]["text"]] - - return lls - - def process_results(self, doc, results): - gold = doc["answer"] - - acc = 1.0 if doc["choices"]["id"][np.argmax(results)] == gold else 0.0 - ranking = [doc["choices"]["id"][i] for i in np.argsort(results)[::-1]] - correct_answer_ranking = ranking.index(gold) + 1 - map_score = 1.0 / correct_answer_ranking - map_2 = 0.0 if correct_answer_ranking > 2 else map_score - map_3 = 0.0 if correct_answer_ranking > 3 else map_score - map_4 = 0.0 if correct_answer_ranking > 4 else map_score - - return { - "acc": acc, - "map": map_score, - "map_2": map_2, - "map_3": map_3, - "map_4": map_4, - } - - def higher_is_better(self): - return {"acc": True, "map": True, "map_2": True, "map_3": True, "map_4": True} - - def aggregation(self): - return { - "acc": mean, - "map": mean, - "map_2": mean, - "map_3": mean, - "map_4": mean, - } - - -class FP2WithAnlpPrompt(FP2): - PROMPT_VERSION = 0.1 - DESCRIPTION = "[問題]に対する[答え]を[選択肢]の中から選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" - - def doc_to_target(self, doc): - return [ - choice_text - for choice_id, choice_text in zip( - doc["choices"]["id"], doc["choices"]["text"] - ) - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, choice)[0] for choice in doc["choices"]["text"]] - - return lls - - -class FP2WithAnlpPromptAlphabet(FP2): - PROMPT_VERSION = "0.1.2" - DESCRIPTION = "[問題]に対する[答え]を[選択肢]の中からアルファベットで選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choice_doc_text = [] - for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): - choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" - - def doc_to_target(self, doc): - answer = chr(doc["choices"]["id"].index(doc["answer"]) + 65) - return answer - - 
def construct_requests(self, doc, ctx): - lls = [ - rf.loglikelihood(ctx, chr(i + 65))[0] - for i, _ in enumerate(doc["choices"]["id"]) - ] - - return lls - - -class FP2WithFintanPrompt(FP2): - PROMPT_VERSION = 0.2 - DESCRIPTION = "質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = ",".join( - [ - f"{idx}.{choice}" - for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) - ] - ) - return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" - - def doc_to_target(self, doc): - return [ - str(choice_id) - for choice_id in doc["choices"]["id"] - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, str(choice))[0] for choice in doc["choices"]["id"]] - - return lls - - -class FP2WithFintanPromptV1(FP2WithAnlpPrompt): - PROMPT_VERSION = "0.2.1" - DESCRIPTION = "与えられた選択肢の中から、最適な答えを選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - return f"質問:{q_doc_text}\n回答:" - - -class FP2WithAlpacaPrompt(FP2WithAnlpPrompt): - PROMPT_VERSION = 0.3 - DESCRIPTION = """以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。 - -### 指示: -与えられた選択肢の中から、最適な答えを選んでください。 - -""" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - input_text = f"{q_doc_text}" - return f"### 入力:\n{input_text}\n### 応答:\n" - - -class FP2WithRinnaInstructionSFT(FP2WithAnlpPrompt): - PROMPT_VERSION = 0.4 - DESCRIPTION = "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。システム: 分かりました。" - SEP = "" - FEWSHOT_SEP = "" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] - if doc["context"] and doc["context"] != "": - q_doc_text += "\n" + doc["context"] - input_text = f"{q_doc_text}" - return f"ユーザー: {input_text}{self.SEP}システム: " - - -class FP2WithRinnaBilingualInstructionSFT(FP2WithRinnaInstructionSFT): - PROMPT_VERSION = 0.5 - DESCRIPTION = "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。\nシステム: 分かりました。\n" - SEP = "\n" - FEWSHOT_SEP = "\n" - - -class FP2WithLlama2(FP2WithAnlpPrompt): - PROMPT_VERSION = 0.6 - DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" - SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", DEFAULT_SYSTEM_PROMPT) - DESCRIPTION = f"[INST] <>\n{SYSTEM_PROMPT}\n<>\n\n" - INSTRUCTION = "与えられた選択肢の中から、最適な答えを選んでください。" - FEWSHOT_SEP = " [INST] " - - def doc_to_text(self, doc): - q_doc_text = doc["question"] - if doc["context"] and doc["context"] != "": - q_doc_text += "\n" + doc["context"] - input_text = f"質問:{q_doc_text}" - return f"{self.INSTRUCTION}\n\n{input_text} [/INST] " - - -VERSIONS = [ - FP2WithAnlpPrompt, - FP2WithAnlpPromptAlphabet, - FP2WithFintanPrompt, - FP2WithFintanPromptV1, - FP2WithAlpacaPrompt, - FP2WithRinnaInstructionSFT, - FP2WithRinnaBilingualInstructionSFT, - FP2WithLlama2, -] - - -def construct_tasks(): - tasks = {} - for version_class in VERSIONS: - tasks[ - f"fp2-{version_class.VERSION}-{version_class.PROMPT_VERSION}" - ] = version_class - tasks["fp2"] = FP2 - return tasks diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.1.2.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.1.2.yaml new file mode 100644 index 0000000..0512c70 --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.1.2.yaml @@ -0,0 +1,6 @@ +include: fp2.yaml +task: fp2-1.0-0.1.2 +description: "[問題]に対する[答え]を[選択肢]の中からアルファベットで選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01_2 +doc_to_target: !function utils.doc_to_target_01_2 +doc_to_choice: !function utils.doc_to_choices_01_2 diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.1.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.1.yaml new file mode 100644 index 0000000..6fc58a5 --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.1.yaml @@ -0,0 +1,4 @@ +include: fp2.yaml +task: fp2-1.0-0.1 +description: "[問題]に対する[答え]を[選択肢]の中から選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01 diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.2.1.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.2.1.yaml new file mode 100644 index 0000000..007958e --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.2.1.yaml @@ -0,0 +1,4 @@ +include: fp2-1.0-0.1.yaml +task: fp2-1.0-0.2.1 +description: "与えられた選択肢の中から、最適な答えを選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_02_1 diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.2.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.2.yaml new file mode 100644 index 0000000..3e48089 --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.2.yaml @@ -0,0 +1,6 @@ +include: fp2.yaml +task: fp2-1.0-0.2 +description: "質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n\n" +doc_to_text: !function utils.doc_to_text_02 +doc_to_target: !function utils.doc_to_target_02 +doc_to_choice: !function utils.doc_to_choices_02 diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.3.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.3.yaml new file mode 100644 index 0000000..ea83cee --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.3.yaml @@ -0,0 +1,4 @@ +include: fp2-1.0-0.1.yaml +task: fp2-1.0-0.3 +description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n与えられた選択肢の中から、最適な答えを選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_03 diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.4.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.4.yaml new file mode 100644 index 0000000..3ca9dc4 --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.4.yaml @@ -0,0 +1,5 @@ +include: fp2-1.0-0.1.yaml +task: fp2-1.0-0.4 +description: "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。システム: 分かりました。" +fewshot_delimiter: "" +doc_to_text: !function utils.doc_to_text_04 diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.5.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.5.yaml new file mode 100644 index 
0000000..a019302 --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.5.yaml @@ -0,0 +1,5 @@ +include: fp2-1.0-0.4.yaml +task: fp2-1.0-0.5 +description: "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。\nシステム: 分かりました。\n\n" +fewshot_delimiter: "\n" +doc_to_text: !function utils.doc_to_text_05 diff --git a/jlm_fin_eval/tasks/fp2/fp2-1.0-0.6.yaml b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.6.yaml new file mode 100644 index 0000000..87b8c85 --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2-1.0-0.6.yaml @@ -0,0 +1,5 @@ +include: fp2-1.0-0.1.yaml +task: fp2-1.0-0.6 +description: "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n\n" +fewshot_delimiter: " [INST] " +doc_to_text: !function utils.doc_to_text_06 diff --git a/jlm_fin_eval/tasks/fp2/fp2.yaml b/jlm_fin_eval/tasks/fp2/fp2.yaml new file mode 100644 index 0000000..172445b --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/fp2.yaml @@ -0,0 +1,48 @@ +task: fp2 +dataset_path: jlm_fin_eval/datasets/fp2/fp2.py +dataset_name: fp2 +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +description: "以下の問題の適切な答えを選択肢から選んで1~4の数字で答えなさい。\n\n\n" +target_delimiter: "" +fewshot_delimiter: "\n\n" +doc_to_text: !function utils.doc_to_text_00 +doc_to_target: !function utils.doc_to_target_00 +doc_to_choice: !function utils.doc_to_choices_00 +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: map + aggregation: mean + higher_is_better: true + - metric: map_2 + aggregation: mean + higher_is_better: true + - metric: map_3 + aggregation: mean + higher_is_better: true + - metric: map_4 + aggregation: mean + higher_is_better: true + - metric: map_norm + aggregation: mean + higher_is_better: true + - metric: map_2_norm + aggregation: mean + higher_is_better: true + - metric: map_3_norm + aggregation: mean + higher_is_better: true + - metric: map_4_norm + aggregation: mean + higher_is_better: true +metadata: + - version: 1.0 diff --git a/jlm_fin_eval/tasks/fp2/utils.py b/jlm_fin_eval/tasks/fp2/utils.py new file mode 100644 index 0000000..b279c72 --- /dev/null +++ b/jlm_fin_eval/tasks/fp2/utils.py @@ -0,0 +1,105 @@ +def doc_to_text_00(doc): + doc_text = "【問題】\n" + doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + doc_text += doc["context"] + "\n" + doc_text += "\n【答え】\n" + # doc_text += chr(int(doc["answer"]) + 65) + return doc_text + + +def doc_to_target_00(doc): + answer = doc["choices"]["text"][doc["choices"]["id"].index(doc["answer"])] + return answer + + +def doc_to_choices_00(doc): + choices = [choice for choice in doc["choices"]["text"]] + return choices + + +def doc_to_text_01(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" + + +def doc_to_text_01_2(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + 
q_doc_text += doc["context"] + "\n" + choice_doc_text = [] + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): + choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" + + +def doc_to_target_01_2(doc): + answer = chr(doc["choices"]["id"].index(doc["answer"]) + 65) + return answer + + +def doc_to_choices_01_2(doc): + choices = [chr(i + 65) for i, _ in enumerate(doc["choices"]["id"])] + return choices + + +def doc_to_text_02(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = ",".join( + [ + f"{idx}.{choice}" + for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) + ] + ) + return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" + + +def doc_to_target_02(doc): + return [ + str(choice_id) + for choice_id in doc["choices"]["id"] + if choice_id == doc["answer"] + ][0] + + +def doc_to_choices_02(doc): + return [str(choice) for choice in doc["choices"]["id"]] + + +def doc_to_text_02_1(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + return f"質問:{q_doc_text}\n回答:" + + +def doc_to_text_03(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + input_text = f"{q_doc_text}" + return f"### 入力:\n{input_text}\n### 応答:\n" + + +def doc_to_text_04(doc, SEP=""): + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + input_text = f"{q_doc_text}" + return f"ユーザー: {input_text}{SEP}システム: " + + +def doc_to_text_05(doc): + return doc_to_text_04(doc, SEP="\n") + + +def doc_to_text_06(doc): + INSTRUCTION = "与えられた選択肢の中から、最適な答えを選んでください。" + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + input_text = f"質問:{q_doc_text}" + return f"{INSTRUCTION}\n\n{input_text} [/INST] " diff --git a/jlm_fin_eval/tasks/security_sales_1.py b/jlm_fin_eval/tasks/security_sales_1.py deleted file mode 100644 index 614ed7b..0000000 --- a/jlm_fin_eval/tasks/security_sales_1.py +++ /dev/null @@ -1,262 +0,0 @@ -import inspect -import os - -import numpy as np -from lm_eval.base import MultipleChoiceTask -from lm_eval.base import mean -from lm_eval.base import rf -from sklearn.metrics import accuracy_score - -import jlm_fin_eval.datasets.security_sales_1.security_sales_1 - - -class SecuritySales1(MultipleChoiceTask): - VERSION = 1.0 - DATASET_PATH = inspect.getfile( - jlm_fin_eval.datasets.security_sales_1.security_sales_1 - ) - DATASET_NAME = "security_sales_1" - DESCRIPTION = "以下の問題の適切な答えを選択肢から選んでアルファベットで答えなさい。\n\n" - - def has_training_docs(self): - return False - - def has_validation_docs(self): - return False - - def has_test_docs(self): - return True - - def training_docs(self): - return None - - def validation_docs(self): - return None - - def test_docs(self): - return self.dataset["test"] - - def doc_to_text(self, doc): - doc_text = "【問題】\n" + doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - doc_text += doc["context"] + "\n" - doc_text += "\n【選択肢】\n" - for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): - doc_text += chr(choice_id + 65) + ": " + choice_text + "\n" - doc_text += "\n【答え】\n" - # doc_text += chr(int(doc["answer"]) + 65) - return doc_text - - def doc_to_target(self, doc): - answer = 
chr(int(doc["answer"]) + 65) - return answer - - @staticmethod - def get_answer(doc): - return chr(int(doc["answer"]) + 65) - - @staticmethod - def compute_scores(gold, pred): - acc = accuracy_score(gold, pred) - - return {"acc": acc} - - def construct_requests(self, doc, ctx): - lls = [ - rf.loglikelihood(ctx, "{}".format(chr(choice + 65)))[0] - for choice in doc["choices"]["id"] - ] - - return lls - - def process_results(self, doc, results): - gold = doc["answer"] - - acc = 1.0 if doc["choices"]["id"][np.argmax(results)] == gold else 0.0 - ranking = [doc["choices"]["id"][i] for i in np.argsort(results)[::-1]] - correct_answer_ranking = ranking.index(gold) + 1 - map_score = 1.0 / correct_answer_ranking - map_2 = 0.0 if correct_answer_ranking > 2 else map_score - map_3 = 0.0 if correct_answer_ranking > 3 else map_score - map_4 = 0.0 if correct_answer_ranking > 4 else map_score - - return { - "acc": acc, - "map": map_score, - "map_2": map_2, - "map_3": map_3, - "map_4": map_4, - } - - def higher_is_better(self): - return {"acc": True, "map": True, "map_2": True, "map_3": True, "map_4": True} - - def aggregation(self): - return { - "acc": mean, - "map": mean, - "map_2": mean, - "map_3": mean, - "map_4": mean, - } - - -class SecuritySales1WithAnlpPrompt(SecuritySales1): - PROMPT_VERSION = 0.1 - DESCRIPTION = "[問題]に対する[答え]を[選択肢]の中から選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" - - def doc_to_target(self, doc): - return [ - choice_text - for choice_id, choice_text in zip( - doc["choices"]["id"], doc["choices"]["text"] - ) - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, choice)[0] for choice in doc["choices"]["text"]] - - return lls - - -class SecuritySales1WithAnlpPromptAlphabet(SecuritySales1): - PROMPT_VERSION = "0.1.2" - DESCRIPTION = "[問題]に対する[答え]を[選択肢]の中からアルファベットで選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choice_doc_text = [] - for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): - choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) - return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" - - -class SecuritySales1WithFintanPrompt(SecuritySales1): - PROMPT_VERSION = 0.2 - DESCRIPTION = "質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = ",".join( - [ - f"{idx}.{choice}" - for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) - ] - ) - return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" - - def doc_to_target(self, doc): - return [ - str(choice_id) - for choice_id in doc["choices"]["id"] - if choice_id == doc["answer"] - ][0] - - def construct_requests(self, doc, ctx): - lls = [rf.loglikelihood(ctx, str(choice))[0] for choice in doc["choices"]["id"]] - - return lls - - -class SecuritySales1WithFintanPromptV1(SecuritySales1WithAnlpPrompt): - PROMPT_VERSION = "0.2.1" - DESCRIPTION = "与えられた選択肢の中から、最適な答えを選んでください。\n\n" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += 
doc["context"] + "\n" - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - return f"質問:{q_doc_text}" f"選択肢:\n{choices}\n" "回答:" - - -class SecuritySales1WithAlpacaPrompt(SecuritySales1WithAnlpPrompt): - PROMPT_VERSION = 0.3 - DESCRIPTION = """以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。 - -### 指示: -与えられた選択肢の中から、最適な答えを選んでください。 - -""" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" - return f"### 入力:\n{input_text}\n\n### 応答:\n" - - -class SecuritySales1WithRinnaInstructionSFT(SecuritySales1WithAnlpPrompt): - PROMPT_VERSION = 0.4 - DESCRIPTION = "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。システム: 分かりました。" - SEP = "" - FEWSHOT_SEP = "" - - def doc_to_text(self, doc): - q_doc_text = doc["question"] - if doc["context"] and doc["context"] != "": - q_doc_text += "\n" + doc["context"] - choices = self.SEP.join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"質問:{q_doc_text}{self.SEP}" + f"選択肢:{self.SEP}{choices}" - return f"ユーザー: {input_text}{self.SEP}システム: " - - -class SecuritySales1WithRinnaBilingualInstructionSFT( - SecuritySales1WithRinnaInstructionSFT -): - PROMPT_VERSION = 0.5 - DESCRIPTION = "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。\nシステム: 分かりました。\n" - SEP = "\n" - FEWSHOT_SEP = "\n" - - -class SecuritySales1WithLlama2(SecuritySales1WithAnlpPrompt): - PROMPT_VERSION = 0.6 - DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" - SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", DEFAULT_SYSTEM_PROMPT) - DESCRIPTION = f"[INST] <>\n{SYSTEM_PROMPT}\n<>\n\n" - INSTRUCTION = "与えられた選択肢の中から、最適な答えを選んでください。" - FEWSHOT_SEP = " [INST] " - - def doc_to_text(self, doc): - q_doc_text = doc["question"] + "\n" - if doc["context"] and doc["context"] != "": - q_doc_text += doc["context"] + "\n" - choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) - input_text = f"質問:{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" - return f"{self.INSTRUCTION}\n\n{input_text} [/INST] " - - -VERSIONS = [ - SecuritySales1WithAnlpPrompt, - SecuritySales1WithAnlpPromptAlphabet, - SecuritySales1WithFintanPrompt, - SecuritySales1WithFintanPromptV1, - SecuritySales1WithAlpacaPrompt, - SecuritySales1WithRinnaInstructionSFT, - SecuritySales1WithRinnaBilingualInstructionSFT, - SecuritySales1WithLlama2, -] - - -def construct_tasks(): - tasks = {} - for version_class in VERSIONS: - tasks[ - f"security_sales_1-{version_class.VERSION}-{version_class.PROMPT_VERSION}" - ] = version_class - tasks["security_sales_1"] = SecuritySales1 - return tasks diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.1.2.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.1.2.yaml new file mode 100644 index 0000000..ecd01cc --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.1.2.yaml @@ -0,0 +1,4 @@ +include: security_sales_1.yaml +task: security_sales_1-1.0-0.1.2 +description: "[問題]に対する[答え]を[選択肢]の中からアルファベットで選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01_2 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.1.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.1.yaml new file mode 100644 index 0000000..bd1e214 --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.1.yaml @@ -0,0 +1,6 @@ +include: security_sales_1.yaml +task: security_sales_1-1.0-0.1 +description: "[問題]に対する[答え]を[選択肢]の中から選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_01 +doc_to_target: !function utils.doc_to_target_01 +doc_to_choice: !function utils.doc_to_choices_01 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.2.1.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.2.1.yaml new file mode 100644 index 0000000..0015e2a --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.2.1.yaml @@ -0,0 +1,4 @@ +include: security_sales_1-1.0-0.1.yaml +task: security_sales_1-1.0-0.2.1 +description: "与えられた選択肢の中から、最適な答えを選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_02_1 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.2.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.2.yaml new file mode 100644 index 0000000..3a8776b --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.2.yaml @@ -0,0 +1,6 @@ +include: security_sales_1.yaml +task: security_sales_1-1.0-0.2 +description: "質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。\n\n\n" +doc_to_text: !function utils.doc_to_text_02 +doc_to_target: !function utils.doc_to_target_02 +doc_to_choice: !function utils.doc_to_choices_02 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.3.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.3.yaml new file mode 100644 index 0000000..8362e5c --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.3.yaml @@ 
-0,0 +1,4 @@ +include: security_sales_1-1.0-0.1.yaml +task: security_sales_1-1.0-0.3 +description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n与えられた選択肢の中から、最適な答えを選んでください。\n\n\n" +doc_to_text: !function utils.doc_to_text_03 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.4.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.4.yaml new file mode 100644 index 0000000..6857acf --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.4.yaml @@ -0,0 +1,5 @@ +include: security_sales_1-1.0-0.1.yaml +task: security_sales_1-1.0-0.4 +description: "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。システム: 分かりました。" +fewshot_delimiter: "" +doc_to_text: !function utils.doc_to_text_04 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.5.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.5.yaml new file mode 100644 index 0000000..ea3288f --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.5.yaml @@ -0,0 +1,5 @@ +include: security_sales_1-1.0-0.4.yaml +task: security_sales_1-1.0-0.5 +description: "ユーザー: 与えられた選択肢の中から、最適な答えを選んでください。\nシステム: 分かりました。\n\n" +fewshot_delimiter: "\n" +doc_to_text: !function utils.doc_to_text_05 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.6.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.6.yaml new file mode 100644 index 0000000..764fa3a --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1-1.0-0.6.yaml @@ -0,0 +1,5 @@ +include: security_sales_1-1.0-0.1.yaml +task: security_sales_1-1.0-0.6 +description: "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n\n" +fewshot_delimiter: " [INST] " +doc_to_text: !function utils.doc_to_text_06 diff --git a/jlm_fin_eval/tasks/security_sales_1/security_sales_1.yaml b/jlm_fin_eval/tasks/security_sales_1/security_sales_1.yaml new file mode 100644 index 0000000..a5931cc --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/security_sales_1.yaml @@ -0,0 +1,48 @@ +task: security_sales_1 +dataset_path: jlm_fin_eval/datasets/security_sales_1/security_sales_1.py +dataset_name: security_sales_1 +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +description: "以下の問題の適切な答えを選択肢から選んでアルファベットで答えなさい。\n\n\n" +target_delimiter: "" +fewshot_delimiter: "\n\n" +doc_to_text: !function utils.doc_to_text_00 +doc_to_target: !function utils.doc_to_target_alphabet +doc_to_choice: !function utils.doc_to_choices_alphabet +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: map + aggregation: mean + higher_is_better: true + - metric: map_2 + aggregation: mean + higher_is_better: true + - metric: map_3 + aggregation: mean + higher_is_better: true + - metric: map_4 + aggregation: mean + higher_is_better: true + - metric: map_norm + aggregation: mean + higher_is_better: true + - metric: map_2_norm + aggregation: mean + higher_is_better: true + - metric: map_3_norm + aggregation: mean + higher_is_better: true + - metric: map_4_norm + aggregation: mean + higher_is_better: true +metadata: + - version: 1.0 diff --git a/jlm_fin_eval/tasks/security_sales_1/utils.py b/jlm_fin_eval/tasks/security_sales_1/utils.py new file mode 100644 index 0000000..9c45789 --- /dev/null +++ b/jlm_fin_eval/tasks/security_sales_1/utils.py @@ -0,0 +1,115 @@ +def doc_to_text_00(doc): + doc_text = "【問題】\n" + doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + doc_text += doc["context"] + "\n" + doc_text += "\n【選択肢】\n" + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): + doc_text += chr(choice_id + 65) + ": " + choice_text + "\n" + doc_text += "\n【答え】\n" + # doc_text += chr(int(doc["answer"]) + 65) + return doc_text + + +def doc_to_target_alphabet(doc): + answer = chr(int(doc["answer"]) + 65) + return answer + + +def doc_to_choices_alphabet(doc): + choices = [chr(choice_id + 65) for choice_id in doc["choices"]["id"]] + return choices + + +def doc_to_text_01(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(doc['choices']['text'])}]\n[答え]:" + + +def doc_to_target_01(doc): + return [ + choice_text + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]) + if choice_id == doc["answer"] + ][0] + + +def doc_to_choices_01(doc): + choices = doc["choices"]["text"] + return choices + + +def doc_to_text_01_2(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choice_doc_text = [] + for choice_id, choice_text in zip(doc["choices"]["id"], doc["choices"]["text"]): + choice_doc_text.append(chr(choice_id + 65) + ":" + choice_text) + return f"[問題]:{q_doc_text}[選択肢]:[{', '.join(choice_doc_text)}]\n[答え]:" + + +def doc_to_text_02(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != 
"": + q_doc_text += doc["context"] + "\n" + choices = ",".join( + [ + f"{idx}.{choice}" + for idx, choice in zip(doc["choices"]["id"], doc["choices"]["text"]) + ] + ) + return f"質問:{q_doc_text}" f"選択肢:{choices}\n" "回答:" + + +def doc_to_target_02(doc): + return [ + str(choice_id) + for choice_id in doc["choices"]["id"] + if choice_id == doc["answer"] + ][0] + + +def doc_to_choices_02(doc): + return [str(choice) for choice in doc["choices"]["id"]] + + +def doc_to_text_02_1(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + return f"質問:{q_doc_text}選択肢:\n{choices}\n回答:" + + +def doc_to_text_03(doc): + q_doc_text = doc["question"] + "\n" + if doc["context"] and doc["context"] != "": + q_doc_text += doc["context"] + "\n" + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" + return f"### 入力:\n{input_text}\n\n### 応答:\n" + + +def doc_to_text_04(doc, SEP=""): + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + choices = SEP.join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"質問:{q_doc_text}{SEP}" + f"選択肢:{SEP}{choices}" + return f"ユーザー: {input_text}{SEP}システム: " + + +def doc_to_text_05(doc): + return doc_to_text_04(doc, SEP="\n") + + +def doc_to_text_06(doc): + INSTRUCTION = "与えられた選択肢の中から、最適な答えを選んでください。" + q_doc_text = doc["question"] + if doc["context"] and doc["context"] != "": + q_doc_text += "\n" + doc["context"] + choices = "\n".join([f"- {choice}" for choice in doc["choices"]["text"]]) + input_text = f"質問:{q_doc_text}" + f"出力は以下から選択してください:\n{choices}" + return f"{INSTRUCTION}\n\n{input_text} [/INST] " diff --git a/main.py b/main.py index a958862..23d62fc 100644 --- a/main.py +++ b/main.py @@ -1,194 +1,320 @@ import argparse -import fnmatch import json +import logging import os -from typing import Iterator -from typing import List +import re +import sys +from pathlib import Path +from typing import Union -import openai +import numpy as np +from lm_eval import evaluator from lm_eval import utils -from lm_eval.models.gpt3 import GPT3LM -from lm_eval.models.gpt3 import get_result -from lm_eval.models.gpt3 import oa_completion -from tqdm import tqdm -from transformers.models.auto.tokenization_auto import AutoTokenizer - -from jlm_fin_eval import evaluator -from jlm_fin_eval import tasks - -openai.api_type = os.environ.get("OPENAI_API_TYPE", "open_ai") -openai.api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1") -openai.api_version = os.environ.get("OPENAI_API_VERSION") -openai.api_key = os.environ.get("OPENAI_API_SECRET_KEY") - - -def from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs): - return AutoTokenizer._from_pretrained( - pretrained_model_name_or_path, *inputs, trust_remote_code=True, **kwargs - ) - - -class MultiChoice: - def __init__(self, choices: List[str]) -> None: - self.choices = choices - - # Simple wildcard support (linux filename patterns) - def __contains__(self, values: str) -> bool: - for value in values.split(","): - if len(fnmatch.filter(self.choices, value)) == 0: - return False - - return True - - def __iter__(self) -> Iterator[str]: - for choice in self.choices: - yield choice - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True) - 
parser.add_argument("--model_args", default="") - parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS)) - parser.add_argument("--provide_description", action="store_true") - parser.add_argument("--num_fewshot", type=str, default="0") - parser.add_argument("--batch_size", type=int, default=None) - parser.add_argument("--device", type=str, default=None) - parser.add_argument("--output_path", default=None) - parser.add_argument("--limit", type=int, default=None) - parser.add_argument("--no_cache", action="store_true") - parser.add_argument("--decontamination_ngrams_path", default=None) - parser.add_argument("--description_dict_path", default=None) - parser.add_argument("--check_integrity", action="store_true") - - return parser.parse_args() - - -# Returns a list containing all values of the source_list that -# match at least one of the patterns -def pattern_match(patterns: List[str], source_list: List[str]) -> List[str]: - task_names = set() - for pattern in patterns: - for matching in fnmatch.filter(source_list, pattern): - task_names.add(matching) - return sorted(list(task_names)) - - -def _loglikelihood_tokens(self, requests, disable_tqdm=False): - res = [] - - def _collate(x): - # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because - # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations - # we care about and so we need some kind of backup for when it isn't - toks = x[1] + x[2] - return -len(toks), tuple(toks) - - re_ord = utils.Reorderer(requests, _collate) - - for chunk in tqdm( - list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)), - disable=disable_tqdm, - ): - inps = [] - ctxlens = [] - for cache_key, context_enc, continuation_enc in chunk: - # max_length+1 because the API takes up to 2049 tokens, including the first context token - inp = (context_enc + continuation_enc)[-(self.max_length + 1) :] - # TODO: the logic is much simpler if we just look at the length of continuation tokens - ctxlen = len(context_enc) - max( - 0, len(context_enc) + len(continuation_enc) - (self.max_length + 1) +from lm_eval.__main__ import DEFAULT_RESULTS_FILE +from lm_eval.__main__ import _handle_non_serializable +from lm_eval.__main__ import parse_eval_args +from lm_eval.__main__ import setup_parser +from lm_eval.api.task import ConfigurableTask +from lm_eval.evaluator import request_caching_arg_to_dict +from lm_eval.logging_utils import WandbLogger +from lm_eval.utils import make_table +from lm_eval.utils import simple_parse_args_string + +from jlm_fin_eval.tasks import TaskManager + +ConfigurableTask.original_process_results = ConfigurableTask.process_results + +eval_logger = logging.getLogger("lm-eval") + + +def process_results(self: ConfigurableTask, doc: dict, results: dict) -> dict: + result_dict = self.original_process_results(doc, results) + use_metric = list(self._metric_fn_list.keys()) + metrics = list(set(use_metric) - set(result_dict.keys())) + if len(metrics) > 0: + if self.OUTPUT_TYPE == "multiple_choice": + lls, is_greedy = zip(*results) + choices = self.doc_to_choice(doc) + completion_len = np.array([float(len(i)) for i in choices]) + + pred = np.argmax(lls) + pred_norm = np.argmax(lls / completion_len) + + if self.multiple_input: + gold = self.doc_to_text(doc) + else: + gold = self.doc_to_target(doc) + + gold_index_error = False + if isinstance(gold, list): + gold = [i if i < len(choices) else -100 for i in gold] + if -100 in gold: + gold_index_error = True 
+ else: + if isinstance(gold, int): + gold = gold if gold < len(choices) else -100 + elif isinstance(gold, str): + gold = choices.index(gold) if gold in choices else -100 + + if gold == -100: + gold_index_error = True + + if gold_index_error: + eval_logger.warning( + f"Label index was not in within range of available choices," + f"Sample:\n\n{doc}\n\n" + ) + + if self.multiple_target: + acc = 1.0 if pred in gold else 0.0 + acc_norm = 1.0 if pred_norm in gold else 0.0 + exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold])) + else: + acc = 1.0 if pred == gold else 0.0 + acc_norm = 1.0 if pred_norm == gold else 0.0 + # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly + exact_match = int(is_greedy[gold]) if gold != -100 else 0 + + if ( + len( + set( + [ + "map", + "map_2", + "map_3", + "map_4", + "map_norm", + "map_2_norm", + "map_3_norm", + "map_4_norm", + ] + ) + & set(use_metric) + ) + != 0 + ): + ranking = np.argsort(lls)[::-1].tolist() + ranking_norm = np.argsort(lls / completion_len)[::-1].tolist() + correct_answer_ranking = ranking.index(gold) + 1 + correct_answer_ranking_norm = ranking_norm.index(gold) + 1 + map_score = 1.0 / correct_answer_ranking + map_2 = 0.0 if correct_answer_ranking > 2 else map_score + map_3 = 0.0 if correct_answer_ranking > 3 else map_score + map_4 = 0.0 if correct_answer_ranking > 4 else map_score + map_score_norm = 1.0 / correct_answer_ranking_norm + map_2_norm = 0.0 if correct_answer_ranking_norm > 2 else map_score_norm + map_3_norm = 0.0 if correct_answer_ranking_norm > 3 else map_score_norm + map_4_norm = 0.0 if correct_answer_ranking_norm > 4 else map_score_norm + + result_dict.update( + { + **( + {"f1_norm": (gold, pred_norm)} + if "f1_norm" in use_metric + else {} + ), + **({"map": map_score} if "map" in use_metric else {}), + **({"map_2": map_2} if "map_2" in use_metric else {}), + **({"map_3": map_3} if "map_3" in use_metric else {}), + **({"map_4": map_4} if "map_4" in use_metric else {}), + **( + {"map_norm": map_score_norm} if "map_norm" in use_metric else {} + ), + **( + {"map_2_norm": map_2_norm} if "map_2_norm" in use_metric else {} + ), + **( + {"map_3_norm": map_3_norm} if "map_3_norm" in use_metric else {} + ), + **( + {"map_4_norm": map_4_norm} if "map_4_norm" in use_metric else {} + ), + } ) + else: + raise NotImplementedError - inps.append(inp) - ctxlens.append(ctxlen) - - response = oa_completion( - engine=self.engine, - prompt=[self.tok_decode(inp) for inp in inps], - echo=True, - max_tokens=0, - temperature=0.0, - logprobs=10, - ) - - for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip( - response.choices, ctxlens, chunk - ): - answer = get_result(resp, ctxlen) - - res.append(answer) + return result_dict - # partial caching - if cache_key is not None: - self.cache_hook.add_partial("loglikelihood", cache_key, answer) - return re_ord.get_original(res) +ConfigurableTask.process_results = process_results -GPT3LM._loglikelihood_tokens = _loglikelihood_tokens +def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: + if not args: + # we allow for args to be passed externally, else we parse them ourselves + parser = setup_parser() + args = parse_eval_args(parser) + if args.wandb_args: + wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args)) -def main() -> None: - args = parse_args() + eval_logger = utils.eval_logger + eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) + eval_logger.info(f"Verbosity set to 
{args.verbosity}") + os.environ["TOKENIZERS_PARALLELISM"] = "false" - assert not args.provide_description # not implemented + if args.predict_only: + args.log_samples = True + if (args.log_samples or args.predict_only) and not args.output_path: + raise ValueError( + "Specify --output_path if providing --log_samples or --predict_only" + ) - if "trust_remote_code=True" in args.model_args: - AutoTokenizer._from_pretrained = AutoTokenizer.from_pretrained - AutoTokenizer.from_pretrained = from_pretrained + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + task_manager = TaskManager(args.verbosity, include_path=args.include_path) if args.limit: - print( - "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." + eval_logger.warning( + " --limit SHOULD ONLY BE USED FOR TESTING." + "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." ) if args.tasks is None: - task_names = tasks.ALL_TASKS + eval_logger.error("Need to specify task to evaluate.") + sys.exit() + elif args.tasks == "list": + eval_logger.info( + "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)) + ) + sys.exit() else: - task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS) + if os.path.isdir(args.tasks): + import glob + + task_names = [] + yaml_path = os.path.join(args.tasks, "*.yaml") + for yaml_file in glob.glob(yaml_path): + config = utils.load_yaml_config(yaml_file) + task_names.append(config) + else: + task_list = args.tasks.split(",") + task_names = task_manager.match_tasks(task_list) + for task in [task for task in task_list if task not in task_names]: + if os.path.isfile(task): + config = utils.load_yaml_config(task) + task_names.append(config) + task_missing = [ + task for task in task_list if task not in task_names and "*" not in task + ] # we don't want errors if a wildcard ("*") task name was used + + if task_missing: + missing = ", ".join(task_missing) + eval_logger.error( + f"Tasks were not found: {missing}\n" + f"{utils.SPACING}Try `python main.py --tasks list` for list of available tasks", + ) + raise ValueError( + f"Tasks not found: {missing}. Try `python main.py --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues." + ) - print(f"Selected Tasks: {task_names}") + if args.output_path: + path = Path(args.output_path) + # check if file or 'dir/results.json' exists + if path.is_file(): + raise FileExistsError(f"File already exists at {path}") + output_path_file = path.joinpath(DEFAULT_RESULTS_FILE) + if output_path_file.is_file(): + eval_logger.warning( + f"File {output_path_file} already exists. Results will be overwritten." 
+ ) + # if path json then get parent dir + elif path.suffix in (".json", ".jsonl"): + output_path_file = path + path.parent.mkdir(parents=True, exist_ok=True) + path = path.parent + else: + path.mkdir(parents=True, exist_ok=True) + + # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args + if args.trust_remote_code: + os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code) + args.model_args = ( + args.model_args + + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}" + ) - if args.num_fewshot is not None: - num_fewshot = [int(n) for n in args.num_fewshot.split(",")] - if len(args.num_fewshot) == 1: - num_fewshot = [num_fewshot[0] for _ in task_names] - else: - num_fewshot = [0 for _ in task_names] + eval_logger.info(f"Selected Tasks: {task_names}") + eval_logger.info("Loading selected tasks...") - description_dict = {} - if args.description_dict_path: - with open(args.description_dict_path, "r") as f: - description_dict = json.load(f) + request_caching_args = request_caching_arg_to_dict( + cache_requests=args.cache_requests + ) results = evaluator.simple_evaluate( model=args.model, model_args=args.model_args, tasks=task_names, - num_fewshot=num_fewshot, + num_fewshot=args.num_fewshot, batch_size=args.batch_size, + max_batch_size=args.max_batch_size, device=args.device, - no_cache=args.no_cache, + use_cache=args.use_cache, limit=args.limit, - description_dict=description_dict, - decontamination_ngrams_path=args.decontamination_ngrams_path, check_integrity=args.check_integrity, + write_out=args.write_out, + log_samples=args.log_samples, + gen_kwargs=args.gen_kwargs, + task_manager=task_manager, + verbosity=args.verbosity, + predict_only=args.predict_only, + random_seed=args.seed[0], + numpy_random_seed=args.seed[1], + torch_random_seed=args.seed[2], + **request_caching_args, ) - dumped = json.dumps(results, indent=2) - print(dumped) + if results is not None: + if args.log_samples: + samples = results.pop("samples") + dumped = json.dumps( + results, indent=2, default=_handle_non_serializable, ensure_ascii=False + ) + if args.show_config: + print(dumped) + + batch_sizes = ",".join(map(str, results["config"]["batch_sizes"])) + + # Add W&B logging + if args.wandb_args: + try: + wandb_logger.post_init(results) + wandb_logger.log_eval_result() + if args.log_samples: + wandb_logger.log_eval_samples(samples) + except Exception as e: + eval_logger.info(f"Logging to Weights and Biases failed due to {e}") + + if args.output_path: + output_path_file.open("w", encoding="utf-8").write(dumped) + + if args.log_samples: + for task_name, config in results["configs"].items(): + output_name = "{}_{}".format( + re.sub("/|=", "__", args.model_args), task_name + ) + filename = path.joinpath(f"{output_name}.jsonl") + samples_dumped = json.dumps( + samples[task_name], + indent=2, + default=_handle_non_serializable, + ensure_ascii=False, + ) + filename.write_text(samples_dumped, encoding="utf-8") - if args.output_path: - with open(args.output_path, "w") as f: - f.write(dumped) + print( + f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " + f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}" + ) + print(make_table(results)) + if "groups" in results: + print(make_table(results, "groups")) - print( - f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, " - f"num_fewshot: 
{args.num_fewshot}, batch_size: {args.batch_size}" - ) - print(evaluator.make_table(results)) + if args.wandb_args: + # Tear down wandb run once all the logging is done. + wandb_logger.run.finish() if __name__ == "__main__": - main() + cli_evaluate() diff --git a/pyproject.toml b/pyproject.toml index fcd7603..9a6bdc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,12 +8,13 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.9,<3.13" -lm-eval = {git = "https://github.com/Stability-AI/lm-evaluation-harness", rev = "jp-stable"} +lm-eval = {git = "https://github.com/EleutherAI/lm-evaluation-harness.git"} emoji = "^2.8.0" fugashi = "1.2.1" neologdn = ">=0.5.2" unidic-lite = "1.0.8" -torch = "2.0.0" +torch = "2.1.2" +datasets = "^2.15.0" accelerate = "^0.24.0" bitsandbytes = "^0.41.1" sentencepiece = "^0.1.99" @@ -25,6 +26,7 @@ openai = "0.28.1" transformers = "^4.36.2" tiktoken = "^0.5.2" transformers-stream-generator = "^0.0.4" +vllm = "^0.3.3" [tool.poetry.group.dev.dependencies]