From 147bd69c93edd4c57744a9eba4fcd1ac29543597 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Sat, 14 Aug 2021 01:49:28 +0900 Subject: [PATCH 01/10] refactor: restructure directory structure --- .../{scripts/simple_benchmark.py => eval.py} | 0 evaluation/{datasets => tasks}/__init__.py | 0 .../tydiqa_primary}/__init__.py | 0 .../tydiqa_primary/tydiqa_primary.py} | 0 evaluation/tasks/tydiqa_secondary/__init__.py | 0 .../tydiqa_secondary/tydiqa_secondary.py | 59 +++++++++++++++++++ evaluation/{datasets => tasks/wmt}/wmt.py | 0 7 files changed, 59 insertions(+) rename evaluation/{scripts/simple_benchmark.py => eval.py} (100%) rename evaluation/{datasets => tasks}/__init__.py (100%) rename evaluation/{scripts => tasks/tydiqa_primary}/__init__.py (100%) rename evaluation/{datasets/tydiqa.py => tasks/tydiqa_primary/tydiqa_primary.py} (100%) create mode 100644 evaluation/tasks/tydiqa_secondary/__init__.py create mode 100644 evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py rename evaluation/{datasets => tasks/wmt}/wmt.py (100%) diff --git a/evaluation/scripts/simple_benchmark.py b/evaluation/eval.py similarity index 100% rename from evaluation/scripts/simple_benchmark.py rename to evaluation/eval.py diff --git a/evaluation/datasets/__init__.py b/evaluation/tasks/__init__.py similarity index 100% rename from evaluation/datasets/__init__.py rename to evaluation/tasks/__init__.py diff --git a/evaluation/scripts/__init__.py b/evaluation/tasks/tydiqa_primary/__init__.py similarity index 100% rename from evaluation/scripts/__init__.py rename to evaluation/tasks/tydiqa_primary/__init__.py diff --git a/evaluation/datasets/tydiqa.py b/evaluation/tasks/tydiqa_primary/tydiqa_primary.py similarity index 100% rename from evaluation/datasets/tydiqa.py rename to evaluation/tasks/tydiqa_primary/tydiqa_primary.py diff --git a/evaluation/tasks/tydiqa_secondary/__init__.py b/evaluation/tasks/tydiqa_secondary/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py new file mode 100644 index 0000000..be6838b --- /dev/null +++ b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py @@ -0,0 +1,59 @@ +# Module for any additional processing required for the TyDi QA dataset +# HuggingFace dataset link: https://huggingface.co/datasets/tydiqa + +from jinja2 import Template +from torch.utils.data import Dataset + +TEMPLATE = Template( + """ + {%- set _blank=["passage", "text", "text snippet", "context"]|random -%} + {%- set _position = ["above", "following"] |random -%} + {%- if _position == "above" -%} + {{context}}{{"\n"}} + {%- endif -%} + Given the {{_position}} {{_blank}}, answer the question: {{question}} + {%- if _position == "following" -%} + {{"\n"}}{{context}} + {%- endif -%} + {{"\n"}}Answer: + """ +) + +class TyDiQADataset(Dataset): + def __init__(self, data, tokenizer, target_langs): + super(TyDiQADataset, self).__init__() + self.items = [] + + for sample_id, sample in enumerate(data): + lang = sample["id"].split("-")[0] + if lang in target_langs: + # Filter out samples in languages that are not used during training + prompt = TEMPLATE.render( + id = sample["id"], + context = sample["context"], + question = sample["question"], + ) + prompt = prompt.strip() # Remove trailing white space and newline + + # Tokenize and construct this sample + inputs = tokenizer( + prompt, + padding=True, + return_tensors='pt', + ) + self.items.append( + { + "prompt": prompt, + "lang": lang, + "input_ids": 
inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "input_len": inputs["attention_mask"].shape[1], + "target_answer": [ans.lower() for ans in sample["answers"]['text']], + } + ) + + def __len__(self): + return len(self.items) + + def __getitem__(self, index): + return self.items[index] diff --git a/evaluation/datasets/wmt.py b/evaluation/tasks/wmt/wmt.py similarity index 100% rename from evaluation/datasets/wmt.py rename to evaluation/tasks/wmt/wmt.py From f040061b49039d9ac4520f5d86104d09bc515715 Mon Sep 17 00:00:00 2001 From: Wilson Lee Date: Fri, 13 Aug 2021 14:53:14 -0700 Subject: [PATCH 02/10] refactor: create AutoTask class + implement loop and eval_tasks in eval.py --- README.md | 2 +- evaluation/eval.py | 54 +++++++------------ evaluation/tasks/__init__.py | 9 ++++ evaluation/tasks/auto_task.py | 42 +++++++++++++++ .../tydiqa_secondary/tydiqa_secondary.py | 37 +++++++++++++ 5 files changed, 108 insertions(+), 36 deletions(-) create mode 100644 evaluation/tasks/auto_task.py diff --git a/README.md b/README.md index aca9888..78fc126 100644 --- a/README.md +++ b/README.md @@ -75,5 +75,5 @@ A [simple benchmark](https://github.com/bigscience-workshop/Megatron-DeepSpeed/i [WMT](https://huggingface.co/datasets/wmt19) and [TyDi QA](https://huggingface.co/datasets/tydiqa) E.g. ```shell -python3 -m evaluation.scripts.simple_benchmark --model_name_or_path=gpt2 +python3 -m evaluation.eval --model_name_or_path=gpt2 --eval_tasks tydiqa_secondary ``` diff --git a/evaluation/eval.py b/evaluation/eval.py index 8de06bd..6bc24ed 100644 --- a/evaluation/eval.py +++ b/evaluation/eval.py @@ -1,21 +1,19 @@ import logging from dataclasses import dataclass, field from datetime import datetime -from typing import Optional +from typing import Optional, List import os import torch -from datasets import load_dataset -from tqdm import tqdm from transformers import ( HfArgumentParser, AutoTokenizer, AutoModelForCausalLM, set_seed, ) +import evaluation.tasks # needed for AutoTask.__subclass__() to work correctly +from evaluation.tasks.auto_task import AutoTask -from evaluation.datasets.tydiqa import TyDiQADataset -from evaluation.utils.io import save_json logger = logging.getLogger(__name__) @@ -47,6 +45,10 @@ class EvaluationArguments: default=24, metadata={"help": "Customized random seed"} ) + eval_tasks: Optional[List[str]] = field( + default=None, + metadata={"help": "A list of tasks to run the evaluation on, e.g. 
tydiqa_secondary"} + ) def main(): @@ -64,6 +66,9 @@ def main(): # set random seed set_seed(eval_args.random_seed) + if not eval_args.eval_tasks: + raise ValueError('Must provide at least one eval task!') + logger.info("Beginning evaluation") # Load model & tokenizer @@ -77,40 +82,19 @@ def main(): model.resize_token_embeddings(len(tokenizer)) model.to(torch_device) - # Load dataset - logger.info("Benchmarking TyDiQA...") - target_langs = ["english"] - data = load_dataset("tydiqa", "secondary_task", split="validation") - dataset = TyDiQADataset(data, tokenizer, target_langs) - - tydiqa_substring_matches = 0 - for sample in tqdm(dataset): - output = model.generate( - input_ids=sample["input_ids"].to(torch_device), - attention_mask=sample["attention_mask"].to(torch_device), - max_length=min(sample["input_len"]*2, model.config.n_positions), - ) - - prompt_len = len(sample["prompt"]) - decoded_output = tokenizer.decode(output[0], skip_special_tokens=True) - predicted_answer = decoded_output[prompt_len:] - - target_answers = sample["target_answer"] - substring_match = any([target_answer in predicted_answer.lower() for target_answer in target_answers]) - tydiqa_substring_matches += substring_match - tydiqa_metrics = { - "substring_matches": tydiqa_substring_matches / len(dataset) * 100 - } - logger.info(f"TyDiQA: {tydiqa_metrics['substring_matches']}% of samples contain substring matches") - # Exporting results + output_dir = None if eval_args.output_dir: output_dir = os.path.join(eval_args.output_dir, datetime.now().strftime("%y%m%d_%H%M%S")) os.makedirs(output_dir, exist_ok=True) - # Exporting TyDiQA results - tydiqa_filename = os.path.join(output_dir, "tydiqa.json") - save_json(tydiqa_metrics, tydiqa_filename) - logger.info(f"TyDiQA: result exported to {tydiqa_filename}") + + for eval_task in eval_args.eval_tasks: + logger.info(f"Benchmarking {eval_task}...") + task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model) + task.evaluate() + + if output_dir: + task.save_metrics(output_dir, logger) if __name__ == "__main__": diff --git a/evaluation/tasks/__init__.py b/evaluation/tasks/__init__.py index e69de29..acd0f36 100644 --- a/evaluation/tasks/__init__.py +++ b/evaluation/tasks/__init__.py @@ -0,0 +1,9 @@ +# recursively import every submodule at runtime +# source: https://stackoverflow.com/questions/3365740/how-to-import-all-submodules +import pkgutil + +__all__ = [] +for loader, module_name, is_pkg in pkgutil.walk_packages(__path__): + __all__.append(module_name) + _module = loader.find_module(module_name).load_module(module_name) + globals()[module_name] = _module diff --git a/evaluation/tasks/auto_task.py b/evaluation/tasks/auto_task.py new file mode 100644 index 0000000..03b8157 --- /dev/null +++ b/evaluation/tasks/auto_task.py @@ -0,0 +1,42 @@ +from abc import ABC, abstractmethod +import os + +import torch + +from evaluation.utils.io import save_json + + +class AutoTask(ABC): + def __init__(self, tokenizer, model): + self.tokenizer = tokenizer + self.model = model + self.torch_device = "cuda" if torch.cuda.is_available() else "cpu" + self.metrics = {} + + @classmethod + def from_task_name(cls, task_name: str, tokenizer, model): + all_tasks = cls.__subclasses__() + matched_task = [task for task in all_tasks if task.get_display_name() == task_name] + + if not matched_task: + raise ValueError(f'Invalid task: {task_name}') + + return matched_task[0](tokenizer=tokenizer, model=model) + + @staticmethod + @abstractmethod + def get_display_name() -> str: + pass + + 
@abstractmethod + def evaluate(self) -> None: + pass + + def save_metrics(self, output_dir, logger=None) -> str: + # Exporting TyDiQA results + output_filename = os.path.join(output_dir, f"{self.get_display_name()}.json") + save_json(self.metrics, output_filename) + + if logger: + logger.info(f"{self.get_display_name()}: result exported to {output_filename}") + return output_filename diff --git a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py index be6838b..90e9d8e 100644 --- a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py +++ b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py @@ -1,8 +1,13 @@ # Module for any additional processing required for the TyDi QA dataset # HuggingFace dataset link: https://huggingface.co/datasets/tydiqa +from typing import Dict from jinja2 import Template from torch.utils.data import Dataset +from datasets import load_dataset +from tqdm import tqdm + +from evaluation.tasks.auto_task import AutoTask TEMPLATE = Template( """ @@ -19,6 +24,7 @@ """ ) + class TyDiQADataset(Dataset): def __init__(self, data, tokenizer, target_langs): super(TyDiQADataset, self).__init__() @@ -57,3 +63,34 @@ def __len__(self): def __getitem__(self, index): return self.items[index] + + +class TydiqaSecondaryTask(AutoTask): + @staticmethod + def get_display_name() -> str: + return 'tydiqa_secondary' + + def evaluate(self) -> None: + target_langs = ["english"] + data = load_dataset("tydiqa", "secondary_task", split="validation") + dataset = TyDiQADataset(data, self.tokenizer, target_langs) + + substring_matches = 0 + for sample in tqdm(dataset, desc=f'Evaluating {self.get_display_name()}'): + output = self.model.generate( + input_ids=sample["input_ids"].to(self.torch_device), + attention_mask=sample["attention_mask"].to(self.torch_device), + max_length=min(sample["input_len"] * 2, self.model.config.n_positions), + ) + + prompt_len = len(sample["prompt"]) + decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True) + predicted_answer = decoded_output[prompt_len:] + + target_answers = sample["target_answer"] + substring_match = any([target_answer in predicted_answer.lower() for target_answer in target_answers]) + substring_matches += substring_match + + self.metrics = { + "substring_matches": substring_matches / len(dataset) * 100 + } From a1d728677ea6437f4d7a64e63e84d727c2b16acb Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Sun, 15 Aug 2021 19:10:55 +0900 Subject: [PATCH 03/10] chore: rm obsolete comment Co-authored-by: tttyuntian <33341186+tttyuntian@users.noreply.github.com> --- evaluation/tasks/auto_task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evaluation/tasks/auto_task.py b/evaluation/tasks/auto_task.py index 03b8157..54957de 100644 --- a/evaluation/tasks/auto_task.py +++ b/evaluation/tasks/auto_task.py @@ -33,7 +33,6 @@ def evaluate(self) -> None: pass def save_metrics(self, output_dir, logger=None) -> str: - # Exporting TyDiQA results output_filename = os.path.join(output_dir, f"{self.get_display_name()}.json") save_json(self.metrics, output_filename) From e91d5aba3dd891f69ad02379b5fedb80de211d65 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Sun, 15 Aug 2021 20:44:36 +0900 Subject: [PATCH 04/10] refactor: mv parser & parser, add more args --- evaluation/__init__.py | 12 ++++ evaluation/eval.py | 68 +++++++++---------- evaluation/tasks/auto_task.py | 19 +++--- .../tydiqa_secondary/tydiqa_secondary.py | 4 +- 4 files changed, 53 insertions(+), 50 deletions(-) diff --git 
a/evaluation/__init__.py b/evaluation/__init__.py index e69de29..92c57e9 100644 --- a/evaluation/__init__.py +++ b/evaluation/__init__.py @@ -0,0 +1,12 @@ +import logging + + +logger = logging.getLogger(__name__) +formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt="%m/%d/%Y %H:%M:%S", +) +handler = logging.StreamHandler() +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.setLevel(logging.INFO) \ No newline at end of file diff --git a/evaluation/eval.py b/evaluation/eval.py index 6bc24ed..31b88b2 100644 --- a/evaluation/eval.py +++ b/evaluation/eval.py @@ -1,4 +1,3 @@ -import logging from dataclasses import dataclass, field from datetime import datetime from typing import Optional, List @@ -13,11 +12,7 @@ ) import evaluation.tasks # needed for AutoTask.__subclass__() to work correctly from evaluation.tasks.auto_task import AutoTask - - -logger = logging.getLogger(__name__) - -torch_device = "cuda" if torch.cuda.is_available() else "cpu" +from evaluation import logger @dataclass @@ -37,11 +32,15 @@ class EvaluationArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."} ) - output_dir: Optional[str] = field( + output_dir: str = field( default="outputs", metadata={"help": "Directory for saving evaluation outputs."} ) - random_seed: Optional[int] = field( + tag: Optional[str] = field( + default=None, + metadata={"help": "Identifier for the evaluation run."} + ) + random_seed: int = field( default=24, metadata={"help": "Customized random seed"} ) @@ -49,53 +48,48 @@ class EvaluationArguments: default=None, metadata={"help": "A list of tasks to run the evaluation on, e.g. tydiqa_secondary"} ) - - -def main(): - # parse arguments - parser = HfArgumentParser(EvaluationArguments) - eval_args, = parser.parse_args_into_dataclasses() - - # set up logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", + device: str = field( + default="cuda", + metadata={"help": "Device on which to run evaluation"} ) - logger.setLevel(logging.INFO) - # set random seed - set_seed(eval_args.random_seed) - if not eval_args.eval_tasks: +def main(args): + if not args.eval_tasks: raise ValueError('Must provide at least one eval task!') - + logger.info("Beginning evaluation") + # set random seed + set_seed(args.random_seed) + + # initialize device + device = torch.device(args.device) + # Load model & tokenizer logger.info("Loading model...") - tokenizer = AutoTokenizer.from_pretrained(eval_args.tokenizer_name or eval_args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name or args.model_name_or_path) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "left" - model = AutoModelForCausalLM.from_pretrained(eval_args.model_name_or_path, pad_token_id=tokenizer.eos_token) + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, pad_token_id=tokenizer.eos_token) model.config.pad_token_id = model.config.eos_token_id model.resize_token_embeddings(len(tokenizer)) - model.to(torch_device) + model.to(device) # Exporting results - output_dir = None - if eval_args.output_dir: - output_dir = os.path.join(eval_args.output_dir, datetime.now().strftime("%y%m%d_%H%M%S")) - os.makedirs(output_dir, exist_ok=True) + tag = args.tag or datetime.now().strftime("%y%m%d_%H%M%S") + output_dir = os.path.join(args.output_dir, tag) + os.makedirs(output_dir, exist_ok=True) - for eval_task in 
eval_args.eval_tasks: + for eval_task in args.eval_tasks: logger.info(f"Benchmarking {eval_task}...") - task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model) + task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model, device=device) task.evaluate() - - if output_dir: - task.save_metrics(output_dir, logger) + task.save_metrics(output_dir, logger) if __name__ == "__main__": - main() + parser = HfArgumentParser(EvaluationArguments) + args, = parser.parse_args_into_dataclasses() + main(args) diff --git a/evaluation/tasks/auto_task.py b/evaluation/tasks/auto_task.py index 54957de..dfa6f3b 100644 --- a/evaluation/tasks/auto_task.py +++ b/evaluation/tasks/auto_task.py @@ -1,27 +1,24 @@ from abc import ABC, abstractmethod import os -import torch - from evaluation.utils.io import save_json class AutoTask(ABC): - def __init__(self, tokenizer, model): + def __init__(self, tokenizer, model, device): self.tokenizer = tokenizer self.model = model - self.torch_device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device self.metrics = {} @classmethod - def from_task_name(cls, task_name: str, tokenizer, model): + def from_task_name(cls, task_name: str, tokenizer, model, device): all_tasks = cls.__subclasses__() - matched_task = [task for task in all_tasks if task.get_display_name() == task_name] - - if not matched_task: - raise ValueError(f'Invalid task: {task_name}') - - return matched_task[0](tokenizer=tokenizer, model=model) + for task in all_tasks: + if task.get_display_name() == task_name: + return task(tokenizer=tokenizer, model=model, device=device) + + raise ValueError(f'Invalid task: {task_name}') @staticmethod @abstractmethod diff --git a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py index 90e9d8e..5ce01ea 100644 --- a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py +++ b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py @@ -78,8 +78,8 @@ def evaluate(self) -> None: substring_matches = 0 for sample in tqdm(dataset, desc=f'Evaluating {self.get_display_name()}'): output = self.model.generate( - input_ids=sample["input_ids"].to(self.torch_device), - attention_mask=sample["attention_mask"].to(self.torch_device), + input_ids=sample["input_ids"].to(self.device), + attention_mask=sample["attention_mask"].to(self.device), max_length=min(sample["input_len"] * 2, self.model.config.n_positions), ) From 4491b529ee6cdc07017eca39e8dca86346245af3 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Sun, 15 Aug 2021 20:48:55 +0900 Subject: [PATCH 05/10] refactor: mv `load_dataset` call into Dataset cls --- evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py index 5ce01ea..7d97e42 100644 --- a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py +++ b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py @@ -26,11 +26,12 @@ class TyDiQADataset(Dataset): - def __init__(self, data, tokenizer, target_langs): - super(TyDiQADataset, self).__init__() + def __init__(self, tokenizer, target_langs): + super().__init__() + tydiqa = load_dataset("tydiqa", "secondary_task", split="validation") self.items = [] - for sample_id, sample in enumerate(data): + for sample in tydiqa: lang = sample["id"].split("-")[0] if lang in target_langs: # Filter out samples in languages that are not used during training @@ -71,9 
+72,7 @@ def get_display_name() -> str: return 'tydiqa_secondary' def evaluate(self) -> None: - target_langs = ["english"] - data = load_dataset("tydiqa", "secondary_task", split="validation") - dataset = TyDiQADataset(data, self.tokenizer, target_langs) + dataset = TyDiQADataset(self.tokenizer, target_langs=["english"]) substring_matches = 0 for sample in tqdm(dataset, desc=f'Evaluating {self.get_display_name()}'): From 13c084e2c50dbdca15ab60dc01e729f39aa4ba26 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Sun, 15 Aug 2021 21:06:36 +0900 Subject: [PATCH 06/10] refactor: mv logger to utils --- evaluation/__init__.py | 12 ------------ evaluation/eval.py | 11 +++++------ evaluation/utils/log.py | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 18 deletions(-) create mode 100644 evaluation/utils/log.py diff --git a/evaluation/__init__.py b/evaluation/__init__.py index 92c57e9..e69de29 100644 --- a/evaluation/__init__.py +++ b/evaluation/__init__.py @@ -1,12 +0,0 @@ -import logging - - -logger = logging.getLogger(__name__) -formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - datefmt="%m/%d/%Y %H:%M:%S", -) -handler = logging.StreamHandler() -handler.setFormatter(formatter) -logger.addHandler(handler) -logger.setLevel(logging.INFO) \ No newline at end of file diff --git a/evaluation/eval.py b/evaluation/eval.py index 31b88b2..23b5818 100644 --- a/evaluation/eval.py +++ b/evaluation/eval.py @@ -12,7 +12,7 @@ ) import evaluation.tasks # needed for AutoTask.__subclass__() to work correctly from evaluation.tasks.auto_task import AutoTask -from evaluation import logger +from evaluation.utils.log import get_logger @dataclass @@ -58,14 +58,12 @@ def main(args): if not args.eval_tasks: raise ValueError('Must provide at least one eval task!') - logger.info("Beginning evaluation") - - # set random seed - set_seed(args.random_seed) - # initialize device device = torch.device(args.device) + logger = get_logger() + logger.info(f"Beginning evaluation on device {args.device}") + # Load model & tokenizer logger.info("Loading model...") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name or args.model_name_or_path) @@ -85,6 +83,7 @@ def main(args): for eval_task in args.eval_tasks: logger.info(f"Benchmarking {eval_task}...") task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model, device=device) + set_seed(args.random_seed) task.evaluate() task.save_metrics(output_dir, logger) diff --git a/evaluation/utils/log.py b/evaluation/utils/log.py new file mode 100644 index 0000000..63854c8 --- /dev/null +++ b/evaluation/utils/log.py @@ -0,0 +1,14 @@ +import logging + + +def get_logger(): + logger = logging.getLogger("evaluation") + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt="%m/%d/%Y %H:%M:%S", + ) + handler = logging.StreamHandler() + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + return logger \ No newline at end of file From f259b2565646f524693770fb731b7699a38615ef Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Sun, 15 Aug 2021 21:07:11 +0900 Subject: [PATCH 07/10] style: run formatter --- evaluation/eval.py | 63 ++++++++++--------- evaluation/tasks/auto_task.py | 10 +-- .../tasks/tydiqa_primary/tydiqa_primary.py | 19 +++--- .../tydiqa_secondary/tydiqa_secondary.py | 35 ++++++----- evaluation/utils/log.py | 4 +- 5 files changed, 73 insertions(+), 58 deletions(-) diff --git a/evaluation/eval.py b/evaluation/eval.py index 23b5818..ecbdd9b 100644 
--- a/evaluation/eval.py +++ b/evaluation/eval.py @@ -1,15 +1,11 @@ +import os from dataclasses import dataclass, field from datetime import datetime -from typing import Optional, List -import os +from typing import List, Optional import torch -from transformers import ( - HfArgumentParser, - AutoTokenizer, - AutoModelForCausalLM, - set_seed, -) +from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, set_seed + import evaluation.tasks # needed for AutoTask.__subclass__() to work correctly from evaluation.tasks.auto_task import AutoTask from evaluation.utils.log import get_logger @@ -18,46 +14,49 @@ @dataclass class EvaluationArguments: """ - Arguments for any adjustable params in this evaluation script + Arguments for any adjustable params in this evaluation script """ + model_name_or_path: Optional[str] = field( default=None, - metadata={"help": "The model checkpoint that we want to evaluate, could be name or the path."} + metadata={ + "help": "The model checkpoint that we want to evaluate, could be name or the path." + }, ) config_name: Optional[str] = field( default=None, - metadata={"help": "Pretrained config name or path if not the same as model_name."} + metadata={ + "help": "Pretrained config name or path if not the same as model_name." + }, ) tokenizer_name: Optional[str] = field( default=None, - metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."} + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name." + }, ) output_dir: str = field( - default="outputs", - metadata={"help": "Directory for saving evaluation outputs."} + default="outputs", metadata={"help": "Directory for saving evaluation outputs."} ) tag: Optional[str] = field( - default=None, - metadata={"help": "Identifier for the evaluation run."} - ) - random_seed: int = field( - default=24, - metadata={"help": "Customized random seed"} + default=None, metadata={"help": "Identifier for the evaluation run."} ) + random_seed: int = field(default=24, metadata={"help": "Customized random seed"}) eval_tasks: Optional[List[str]] = field( default=None, - metadata={"help": "A list of tasks to run the evaluation on, e.g. tydiqa_secondary"} + metadata={ + "help": "A list of tasks to run the evaluation on, e.g. 
tydiqa_secondary" + }, ) device: str = field( - default="cuda", - metadata={"help": "Device on which to run evaluation"} + default="cuda", metadata={"help": "Device on which to run evaluation"} ) def main(args): if not args.eval_tasks: - raise ValueError('Must provide at least one eval task!') - + raise ValueError("Must provide at least one eval task!") + # initialize device device = torch.device(args.device) @@ -66,11 +65,15 @@ def main(args): # Load model & tokenizer logger.info("Loading model...") - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name or args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name or args.model_name_or_path + ) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "left" - model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, pad_token_id=tokenizer.eos_token) + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, pad_token_id=tokenizer.eos_token + ) model.config.pad_token_id = model.config.eos_token_id model.resize_token_embeddings(len(tokenizer)) model.to(device) @@ -82,7 +85,9 @@ def main(args): for eval_task in args.eval_tasks: logger.info(f"Benchmarking {eval_task}...") - task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model, device=device) + task = AutoTask.from_task_name( + eval_task, tokenizer=tokenizer, model=model, device=device + ) set_seed(args.random_seed) task.evaluate() task.save_metrics(output_dir, logger) @@ -90,5 +95,5 @@ def main(args): if __name__ == "__main__": parser = HfArgumentParser(EvaluationArguments) - args, = parser.parse_args_into_dataclasses() + (args,) = parser.parse_args_into_dataclasses() main(args) diff --git a/evaluation/tasks/auto_task.py b/evaluation/tasks/auto_task.py index dfa6f3b..8df39e2 100644 --- a/evaluation/tasks/auto_task.py +++ b/evaluation/tasks/auto_task.py @@ -1,5 +1,5 @@ -from abc import ABC, abstractmethod import os +from abc import ABC, abstractmethod from evaluation.utils.io import save_json @@ -17,8 +17,8 @@ def from_task_name(cls, task_name: str, tokenizer, model, device): for task in all_tasks: if task.get_display_name() == task_name: return task(tokenizer=tokenizer, model=model, device=device) - - raise ValueError(f'Invalid task: {task_name}') + + raise ValueError(f"Invalid task: {task_name}") @staticmethod @abstractmethod @@ -34,5 +34,7 @@ def save_metrics(self, output_dir, logger=None) -> str: save_json(self.metrics, output_filename) if logger: - logger.info(f"{self.get_display_name()}: result exported to {output_filename}") + logger.info( + f"{self.get_display_name()}: result exported to {output_filename}" + ) return output_filename diff --git a/evaluation/tasks/tydiqa_primary/tydiqa_primary.py b/evaluation/tasks/tydiqa_primary/tydiqa_primary.py index be6838b..ee5c6a0 100644 --- a/evaluation/tasks/tydiqa_primary/tydiqa_primary.py +++ b/evaluation/tasks/tydiqa_primary/tydiqa_primary.py @@ -19,19 +19,20 @@ """ ) + class TyDiQADataset(Dataset): def __init__(self, data, tokenizer, target_langs): super(TyDiQADataset, self).__init__() self.items = [] - + for sample_id, sample in enumerate(data): lang = sample["id"].split("-")[0] if lang in target_langs: # Filter out samples in languages that are not used during training prompt = TEMPLATE.render( - id = sample["id"], - context = sample["context"], - question = sample["question"], + id=sample["id"], + context=sample["context"], + question=sample["question"], ) prompt = prompt.strip() # Remove trailing white space and newline @@ -39,7 +40,7 @@ 
def __init__(self, data, tokenizer, target_langs): inputs = tokenizer( prompt, padding=True, - return_tensors='pt', + return_tensors="pt", ) self.items.append( { @@ -48,12 +49,14 @@ def __init__(self, data, tokenizer, target_langs): "input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "input_len": inputs["attention_mask"].shape[1], - "target_answer": [ans.lower() for ans in sample["answers"]['text']], + "target_answer": [ + ans.lower() for ans in sample["answers"]["text"] + ], } ) - + def __len__(self): return len(self.items) - + def __getitem__(self, index): return self.items[index] diff --git a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py index 7d97e42..9d99902 100644 --- a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py +++ b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py @@ -2,9 +2,9 @@ # HuggingFace dataset link: https://huggingface.co/datasets/tydiqa from typing import Dict +from datasets import load_dataset from jinja2 import Template from torch.utils.data import Dataset -from datasets import load_dataset from tqdm import tqdm from evaluation.tasks.auto_task import AutoTask @@ -30,15 +30,15 @@ def __init__(self, tokenizer, target_langs): super().__init__() tydiqa = load_dataset("tydiqa", "secondary_task", split="validation") self.items = [] - + for sample in tydiqa: lang = sample["id"].split("-")[0] if lang in target_langs: # Filter out samples in languages that are not used during training prompt = TEMPLATE.render( - id = sample["id"], - context = sample["context"], - question = sample["question"], + id=sample["id"], + context=sample["context"], + question=sample["question"], ) prompt = prompt.strip() # Remove trailing white space and newline @@ -46,7 +46,7 @@ def __init__(self, tokenizer, target_langs): inputs = tokenizer( prompt, padding=True, - return_tensors='pt', + return_tensors="pt", ) self.items.append( { @@ -55,13 +55,15 @@ def __init__(self, tokenizer, target_langs): "input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "input_len": inputs["attention_mask"].shape[1], - "target_answer": [ans.lower() for ans in sample["answers"]['text']], + "target_answer": [ + ans.lower() for ans in sample["answers"]["text"] + ], } ) - + def __len__(self): return len(self.items) - + def __getitem__(self, index): return self.items[index] @@ -69,13 +71,13 @@ def __getitem__(self, index): class TydiqaSecondaryTask(AutoTask): @staticmethod def get_display_name() -> str: - return 'tydiqa_secondary' + return "tydiqa_secondary" def evaluate(self) -> None: dataset = TyDiQADataset(self.tokenizer, target_langs=["english"]) substring_matches = 0 - for sample in tqdm(dataset, desc=f'Evaluating {self.get_display_name()}'): + for sample in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"): output = self.model.generate( input_ids=sample["input_ids"].to(self.device), attention_mask=sample["attention_mask"].to(self.device), @@ -87,9 +89,12 @@ def evaluate(self) -> None: predicted_answer = decoded_output[prompt_len:] target_answers = sample["target_answer"] - substring_match = any([target_answer in predicted_answer.lower() for target_answer in target_answers]) + substring_match = any( + [ + target_answer in predicted_answer.lower() + for target_answer in target_answers + ] + ) substring_matches += substring_match - self.metrics = { - "substring_matches": substring_matches / len(dataset) * 100 - } + self.metrics = {"substring_matches": substring_matches / len(dataset) * 
100} diff --git a/evaluation/utils/log.py b/evaluation/utils/log.py index 63854c8..6e1c83e 100644 --- a/evaluation/utils/log.py +++ b/evaluation/utils/log.py @@ -4,11 +4,11 @@ def get_logger(): logger = logging.getLogger("evaluation") formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", ) handler = logging.StreamHandler() handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(logging.INFO) - return logger \ No newline at end of file + return logger From 801584bf448c21437bce2847d4877b6c8f44edde Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Mon, 16 Aug 2021 04:58:40 +0900 Subject: [PATCH 08/10] Revert "style: run formatter" This reverts commit f259b2565646f524693770fb731b7699a38615ef. --- evaluation/eval.py | 63 +++++++++---------- evaluation/tasks/auto_task.py | 10 ++- .../tasks/tydiqa_primary/tydiqa_primary.py | 19 +++--- .../tydiqa_secondary/tydiqa_secondary.py | 35 +++++------ evaluation/utils/log.py | 4 +- 5 files changed, 58 insertions(+), 73 deletions(-) diff --git a/evaluation/eval.py b/evaluation/eval.py index ecbdd9b..23b5818 100644 --- a/evaluation/eval.py +++ b/evaluation/eval.py @@ -1,11 +1,15 @@ -import os from dataclasses import dataclass, field from datetime import datetime -from typing import List, Optional +from typing import Optional, List +import os import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, set_seed - +from transformers import ( + HfArgumentParser, + AutoTokenizer, + AutoModelForCausalLM, + set_seed, +) import evaluation.tasks # needed for AutoTask.__subclass__() to work correctly from evaluation.tasks.auto_task import AutoTask from evaluation.utils.log import get_logger @@ -14,49 +18,46 @@ @dataclass class EvaluationArguments: """ - Arguments for any adjustable params in this evaluation script + Arguments for any adjustable params in this evaluation script """ - model_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": "The model checkpoint that we want to evaluate, could be name or the path." - }, + metadata={"help": "The model checkpoint that we want to evaluate, could be name or the path."} ) config_name: Optional[str] = field( default=None, - metadata={ - "help": "Pretrained config name or path if not the same as model_name." - }, + metadata={"help": "Pretrained config name or path if not the same as model_name."} ) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": "Pretrained tokenizer name or path if not the same as model_name." - }, + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."} ) output_dir: str = field( - default="outputs", metadata={"help": "Directory for saving evaluation outputs."} + default="outputs", + metadata={"help": "Directory for saving evaluation outputs."} ) tag: Optional[str] = field( - default=None, metadata={"help": "Identifier for the evaluation run."} + default=None, + metadata={"help": "Identifier for the evaluation run."} + ) + random_seed: int = field( + default=24, + metadata={"help": "Customized random seed"} ) - random_seed: int = field(default=24, metadata={"help": "Customized random seed"}) eval_tasks: Optional[List[str]] = field( default=None, - metadata={ - "help": "A list of tasks to run the evaluation on, e.g. tydiqa_secondary" - }, + metadata={"help": "A list of tasks to run the evaluation on, e.g. 
tydiqa_secondary"} ) device: str = field( - default="cuda", metadata={"help": "Device on which to run evaluation"} + default="cuda", + metadata={"help": "Device on which to run evaluation"} ) def main(args): if not args.eval_tasks: - raise ValueError("Must provide at least one eval task!") - + raise ValueError('Must provide at least one eval task!') + # initialize device device = torch.device(args.device) @@ -65,15 +66,11 @@ def main(args): # Load model & tokenizer logger.info("Loading model...") - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name or args.model_name_or_path - ) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name or args.model_name_or_path) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "left" - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, pad_token_id=tokenizer.eos_token - ) + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, pad_token_id=tokenizer.eos_token) model.config.pad_token_id = model.config.eos_token_id model.resize_token_embeddings(len(tokenizer)) model.to(device) @@ -85,9 +82,7 @@ def main(args): for eval_task in args.eval_tasks: logger.info(f"Benchmarking {eval_task}...") - task = AutoTask.from_task_name( - eval_task, tokenizer=tokenizer, model=model, device=device - ) + task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model, device=device) set_seed(args.random_seed) task.evaluate() task.save_metrics(output_dir, logger) @@ -95,5 +90,5 @@ def main(args): if __name__ == "__main__": parser = HfArgumentParser(EvaluationArguments) - (args,) = parser.parse_args_into_dataclasses() + args, = parser.parse_args_into_dataclasses() main(args) diff --git a/evaluation/tasks/auto_task.py b/evaluation/tasks/auto_task.py index 8df39e2..dfa6f3b 100644 --- a/evaluation/tasks/auto_task.py +++ b/evaluation/tasks/auto_task.py @@ -1,5 +1,5 @@ -import os from abc import ABC, abstractmethod +import os from evaluation.utils.io import save_json @@ -17,8 +17,8 @@ def from_task_name(cls, task_name: str, tokenizer, model, device): for task in all_tasks: if task.get_display_name() == task_name: return task(tokenizer=tokenizer, model=model, device=device) - - raise ValueError(f"Invalid task: {task_name}") + + raise ValueError(f'Invalid task: {task_name}') @staticmethod @abstractmethod @@ -34,7 +34,5 @@ def save_metrics(self, output_dir, logger=None) -> str: save_json(self.metrics, output_filename) if logger: - logger.info( - f"{self.get_display_name()}: result exported to {output_filename}" - ) + logger.info(f"{self.get_display_name()}: result exported to {output_filename}") return output_filename diff --git a/evaluation/tasks/tydiqa_primary/tydiqa_primary.py b/evaluation/tasks/tydiqa_primary/tydiqa_primary.py index ee5c6a0..be6838b 100644 --- a/evaluation/tasks/tydiqa_primary/tydiqa_primary.py +++ b/evaluation/tasks/tydiqa_primary/tydiqa_primary.py @@ -19,20 +19,19 @@ """ ) - class TyDiQADataset(Dataset): def __init__(self, data, tokenizer, target_langs): super(TyDiQADataset, self).__init__() self.items = [] - + for sample_id, sample in enumerate(data): lang = sample["id"].split("-")[0] if lang in target_langs: # Filter out samples in languages that are not used during training prompt = TEMPLATE.render( - id=sample["id"], - context=sample["context"], - question=sample["question"], + id = sample["id"], + context = sample["context"], + question = sample["question"], ) prompt = prompt.strip() # Remove trailing white space and newline @@ -40,7 +39,7 @@ def __init__(self, data, 
tokenizer, target_langs): inputs = tokenizer( prompt, padding=True, - return_tensors="pt", + return_tensors='pt', ) self.items.append( { @@ -49,14 +48,12 @@ def __init__(self, data, tokenizer, target_langs): "input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "input_len": inputs["attention_mask"].shape[1], - "target_answer": [ - ans.lower() for ans in sample["answers"]["text"] - ], + "target_answer": [ans.lower() for ans in sample["answers"]['text']], } ) - + def __len__(self): return len(self.items) - + def __getitem__(self, index): return self.items[index] diff --git a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py index 9d99902..7d97e42 100644 --- a/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py +++ b/evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py @@ -2,9 +2,9 @@ # HuggingFace dataset link: https://huggingface.co/datasets/tydiqa from typing import Dict -from datasets import load_dataset from jinja2 import Template from torch.utils.data import Dataset +from datasets import load_dataset from tqdm import tqdm from evaluation.tasks.auto_task import AutoTask @@ -30,15 +30,15 @@ def __init__(self, tokenizer, target_langs): super().__init__() tydiqa = load_dataset("tydiqa", "secondary_task", split="validation") self.items = [] - + for sample in tydiqa: lang = sample["id"].split("-")[0] if lang in target_langs: # Filter out samples in languages that are not used during training prompt = TEMPLATE.render( - id=sample["id"], - context=sample["context"], - question=sample["question"], + id = sample["id"], + context = sample["context"], + question = sample["question"], ) prompt = prompt.strip() # Remove trailing white space and newline @@ -46,7 +46,7 @@ def __init__(self, tokenizer, target_langs): inputs = tokenizer( prompt, padding=True, - return_tensors="pt", + return_tensors='pt', ) self.items.append( { @@ -55,15 +55,13 @@ def __init__(self, tokenizer, target_langs): "input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "input_len": inputs["attention_mask"].shape[1], - "target_answer": [ - ans.lower() for ans in sample["answers"]["text"] - ], + "target_answer": [ans.lower() for ans in sample["answers"]['text']], } ) - + def __len__(self): return len(self.items) - + def __getitem__(self, index): return self.items[index] @@ -71,13 +69,13 @@ def __getitem__(self, index): class TydiqaSecondaryTask(AutoTask): @staticmethod def get_display_name() -> str: - return "tydiqa_secondary" + return 'tydiqa_secondary' def evaluate(self) -> None: dataset = TyDiQADataset(self.tokenizer, target_langs=["english"]) substring_matches = 0 - for sample in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"): + for sample in tqdm(dataset, desc=f'Evaluating {self.get_display_name()}'): output = self.model.generate( input_ids=sample["input_ids"].to(self.device), attention_mask=sample["attention_mask"].to(self.device), @@ -89,12 +87,9 @@ def evaluate(self) -> None: predicted_answer = decoded_output[prompt_len:] target_answers = sample["target_answer"] - substring_match = any( - [ - target_answer in predicted_answer.lower() - for target_answer in target_answers - ] - ) + substring_match = any([target_answer in predicted_answer.lower() for target_answer in target_answers]) substring_matches += substring_match - self.metrics = {"substring_matches": substring_matches / len(dataset) * 100} + self.metrics = { + "substring_matches": substring_matches / len(dataset) * 100 + } diff --git 
a/evaluation/utils/log.py b/evaluation/utils/log.py index 6e1c83e..63854c8 100644 --- a/evaluation/utils/log.py +++ b/evaluation/utils/log.py @@ -4,11 +4,11 @@ def get_logger(): logger = logging.getLogger("evaluation") formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt="%m/%d/%Y %H:%M:%S", ) handler = logging.StreamHandler() handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(logging.INFO) - return logger + return logger \ No newline at end of file From 93d7f218df58bd99018acf6a8879c25c42d666aa Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Mon, 16 Aug 2021 16:31:16 +0900 Subject: [PATCH 09/10] fix: mv parser into `main()` --- evaluation/eval.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/evaluation/eval.py b/evaluation/eval.py index 23b5818..4218147 100644 --- a/evaluation/eval.py +++ b/evaluation/eval.py @@ -54,7 +54,10 @@ class EvaluationArguments: ) -def main(args): +def main(): + parser = HfArgumentParser(EvaluationArguments) + args, = parser.parse_args_into_dataclasses() + if not args.eval_tasks: raise ValueError('Must provide at least one eval task!') @@ -89,6 +92,4 @@ def main(args): if __name__ == "__main__": - parser = HfArgumentParser(EvaluationArguments) - args, = parser.parse_args_into_dataclasses() - main(args) + main() From 78cdd165fba6d0c68b2ce50c8272065b3cc327c4 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Tue, 17 Aug 2021 06:57:22 +0900 Subject: [PATCH 10/10] feat: use `train_args`, rm duplicate fields --- evaluation/eval.py | 49 ++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/evaluation/eval.py b/evaluation/eval.py index 4218147..053a291 100644 --- a/evaluation/eval.py +++ b/evaluation/eval.py @@ -8,6 +8,7 @@ HfArgumentParser, AutoTokenizer, AutoModelForCausalLM, + TrainingArguments, set_seed, ) import evaluation.tasks # needed for AutoTask.__subclass__() to work correctly @@ -20,10 +21,12 @@ class EvaluationArguments: """ Arguments for any adjustable params in this evaluation script """ - model_name_or_path: Optional[str] = field( - default=None, + model_name_or_path: str = field( metadata={"help": "The model checkpoint that we want to evaluate, could be name or the path."} ) + eval_tasks: List[str] = field( + metadata={"help": "A list of tasks to run the evaluation on, e.g. tydiqa_secondary"} + ) config_name: Optional[str] = field( default=None, metadata={"help": "Pretrained config name or path if not the same as model_name."} @@ -32,61 +35,47 @@ class EvaluationArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."} ) - output_dir: str = field( - default="outputs", - metadata={"help": "Directory for saving evaluation outputs."} - ) tag: Optional[str] = field( default=None, metadata={"help": "Identifier for the evaluation run."} - ) - random_seed: int = field( - default=24, - metadata={"help": "Customized random seed"} - ) - eval_tasks: Optional[List[str]] = field( - default=None, - metadata={"help": "A list of tasks to run the evaluation on, e.g. 
tydiqa_secondary"} - ) - device: str = field( - default="cuda", - metadata={"help": "Device on which to run evaluation"} - ) + ) def main(): - parser = HfArgumentParser(EvaluationArguments) - args, = parser.parse_args_into_dataclasses() + parser = HfArgumentParser((EvaluationArguments, TrainingArguments)) + eval_args, train_args = parser.parse_args_into_dataclasses() - if not args.eval_tasks: + if not eval_args.eval_tasks: raise ValueError('Must provide at least one eval task!') # initialize device - device = torch.device(args.device) + device = torch.device(train_args.device) logger = get_logger() - logger.info(f"Beginning evaluation on device {args.device}") + logger.info(f"Beginning evaluation on device {train_args.device}") # Load model & tokenizer logger.info("Loading model...") - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name or args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(eval_args.tokenizer_name or eval_args.model_name_or_path) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "left" - model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, pad_token_id=tokenizer.eos_token) + model = AutoModelForCausalLM.from_pretrained( + eval_args.model_name_or_path, pad_token_id=tokenizer.eos_token, + ) model.config.pad_token_id = model.config.eos_token_id model.resize_token_embeddings(len(tokenizer)) model.to(device) # Exporting results - tag = args.tag or datetime.now().strftime("%y%m%d_%H%M%S") - output_dir = os.path.join(args.output_dir, tag) + tag = eval_args.tag or datetime.now().strftime("%y%m%d_%H%M%S") + output_dir = os.path.join(train_args.output_dir, tag) os.makedirs(output_dir, exist_ok=True) - for eval_task in args.eval_tasks: + for eval_task in eval_args.eval_tasks: logger.info(f"Benchmarking {eval_task}...") task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model, device=device) - set_seed(args.random_seed) + set_seed(train_args.seed) task.evaluate() task.save_metrics(output_dir, logger)