From 74bac48453eeed0353451fc1db08572b16662ae3 Mon Sep 17 00:00:00 2001 From: ledong0110 <74060032+ledong0110@users.noreply.github.com> Date: Sun, 4 Aug 2024 00:42:59 +0700 Subject: [PATCH] fix: adapt IR task --- config/vi/dataset_info.json | 399 +++++++++-------- src/vieval/tools/metrics/ir.py | 11 +- src/vieval/tools/metrics/language.py | 2 +- .../tools/pipelines/metric_pipelines.py | 18 +- src/vieval/tools/pipelines/pipelines.py | 418 +++++------------- 5 files changed, 320 insertions(+), 528 deletions(-) diff --git a/config/vi/dataset_info.json b/config/vi/dataset_info.json index 532f2eb..9121c1c 100644 --- a/config/vi/dataset_info.json +++ b/config/vi/dataset_info.json @@ -1,196 +1,209 @@ { - "xquad_xtreme": { - "hf_hub_url": "juletxara/xquad_xtreme", - "subset": "vi", - "train_split": "translate_train", - "test_split": "test", - "task": "question-answering", - "prompting_strategy": 0, - "columns": { - "context": "context", - "query": "question", - "answer": "answers" - } - }, - "mlqa": { - "hf_hub_url": "mlqa", - "subset": "mlqa.vi.vi", - "train_split": "validation", - "test_split": "test", - "task": "question-answering", - "prompting_strategy": 0, - "columns": { - "context": "context", - "query": "question", - "answer": "answers" - } - }, - "vietnews": { - "hf_hub_url": "Yuhthe/vietnews", - "task": "summarization", - "prompting_strategy": 0, - "columns": { - "source": "article", - "target": "abstract" - } - }, - "wiki_lingua": { - "hf_hub_url": "GEM/wiki_lingua", - "subset": "vi", - "task": "summarization", - "prompting_strategy": 0 - }, - "UIT-VSFC": { - "file_name": "UIT-VSFC", - "task": "sentiment-analysis", - "prompting_strategy": 0, - "label": [0, 1, 2], - "columns": { - "query": "text", - "answer": "label" - } - }, - "ViHSD": { - "file_name": "ViHSD", - "task": "toxicity-detection", - "prompting_strategy": 1, - "label": [0, 1, 2], - "columns": { - "query": "free_text", - "answer": "label_id" - } - }, - "UIT-VSMEC": { - "file_name": "UIT-VSMEC", - "task": "text-classification", - "prompting_strategy": 0, - "label": [0, 1, 2, 3, 4, 5, 6], - "columns": { - "query": "Sentence", - "answer": "Label" - } - }, - "ViMMRC": { - "file_name": "ViMMRC", - "task": "knowledge-mtpchoice", - "prompting_strategy": 0, - "label": ["A", "B", "C", "D"], - "columns": { - "context": "article", - "query": "question", - "options": "options", - "answer": "answer" - - } - }, - "ViMMRC_random": { - "file_name": "ViMMRC", - "task": "knowledge-mtpchoice", - "prompting_strategy": 0, - "label": ["A", "B", "C", "D"], - "random": true, - "columns": { - "context": "article", - "query": "question", - "options": "options", - "answer": "answer" - - } - }, - "zalo_e2eqa": { - "file_name": "zalo_e2eqa", - "task": "knowledge-openended", - "prompting_strategy": 0, - "columns": { - "query": "question", - "answer": "answers" - } - }, - "zalo_e2eqa_fairness": { - "file_name": "zalo_e2eqa_fairness", - "task": "knowledge-openended", - "prompting_strategy": 0, - "columns": { - "query": "question", - "answer": "answers" - } - }, - "zalo_e2eqa_robustness": { - "file_name": "zalo_e2eqa_robustness", - "task": "knowledge-openended", - "prompting_strategy": 0, - "columns": { - "query": "question", - "answer": "answers" - } - }, - "VSEC": { - "file_name": "VSEC", - "task": "language-modeling", - "prompting_strategy": 1, - "columns": { - "source": "text", - "target": "correct" - } - }, - "synthetic_natural_azr": { - "hf_hub_url": "ura-hcmut/synthetic_reasoning_natural", - "subset": "easy_azr", - "task": "reasoning", - 
"prompting_strategy": 0, - "columns": { - "query": "problem", - "answer": "target" - } - }, - "synthetic_abstract": { - "hf_hub_url": "ura-hcmut/synthetic_reasoning", - "subset": "easy_azr", - "task": "reasoning", - "prompting_strategy": 0, - "columns": { - "query": "source", - "answer": "target" - } - }, - "math_level1_azr": { - "hf_hub_url": "ura-hcmut/MATH_Level_1", - "subset": "azr", - "task": "math", - "prompting_strategy": 0, - "columns": { - "type_id": "type", - "query": "problem", - "answer": "short_solution" - } - }, - "math_level1_cot_azr": { - "hf_hub_url": "ura-hcmut/MATH_Level_1", - "subset": "azr", - "task": "math", - "prompting_strategy": 0, - "columns": { - "type_id": "type", - "query": "problem", - "answer": "solution" - } - }, - "opus100_envi": { - "hf_hub_url": "vietgpt/opus100_envi", - "task": "translation", - "prompting_strategy": 0, - "columns": { - "source": "en", - "target": "vi" - } - }, - "opus100_vien": { - "hf_hub_url": "vietgpt/opus100_envi", - "task": "translation", - "prompting_strategy": 0, - "columns": { - "source": "vi", - "target": "en" - } + "xquad_xtreme": { + "hf_hub_url": "juletxara/xquad_xtreme", + "subset": "vi", + "train_split": "translate_train", + "test_split": "test", + "task": "question-answering", + "prompting_strategy": 0, + "columns": { + "context": "context", + "query": "question", + "answer": "answers" } + }, + "mlqa": { + "hf_hub_url": "mlqa", + "subset": "mlqa.vi.vi", + "train_split": "validation", + "test_split": "test", + "task": "question-answering", + "prompting_strategy": 0, + "columns": { + "context": "context", + "query": "question", + "answer": "answers" + } + }, + "vietnews": { + "hf_hub_url": "Yuhthe/vietnews", + "task": "summarization", + "prompting_strategy": 0, + "columns": { + "source": "article", + "target": "abstract" + } + }, + "wiki_lingua": { + "hf_hub_url": "GEM/wiki_lingua", + "subset": "vi", + "task": "summarization", + "prompting_strategy": 0 + }, + "UIT-VSFC": { + "file_name": "UIT-VSFC", + "task": "sentiment-analysis", + "prompting_strategy": 0, + "label": [0, 1, 2], + "columns": { + "query": "text", + "answer": "label" + } + }, + "ViHSD": { + "file_name": "ViHSD", + "task": "toxicity-detection", + "prompting_strategy": 1, + "label": [0, 1, 2], + "columns": { + "query": "free_text", + "answer": "label_id" + } + }, + "UIT-VSMEC": { + "file_name": "UIT-VSMEC", + "task": "text-classification", + "prompting_strategy": 0, + "label": [0, 1, 2, 3, 4, 5, 6], + "columns": { + "query": "Sentence", + "answer": "Label" + } + }, + "ViMMRC": { + "file_name": "ViMMRC", + "task": "knowledge-mtpchoice", + "prompting_strategy": 0, + "label": ["A", "B", "C", "D"], + "columns": { + "context": "article", + "query": "question", + "options": "options", + "answer": "answer" + + } + }, + "ViMMRC_random": { + "file_name": "ViMMRC", + "task": "knowledge-mtpchoice", + "prompting_strategy": 0, + "label": ["A", "B", "C", "D"], + "random": true, + "columns": { + "context": "article", + "query": "question", + "options": "options", + "answer": "answer" + + } + }, + "mmarco": { + "file_name": "mmarco", + "task": "information-retrieval", + "prompting_strategy": 0, + "label": ["Yes", "No"], + "columns": { + "type_id": "id", + "passages": "passages", + "query": "query", + "answer": "references" + + } + }, + "zalo_e2eqa": { + "file_name": "zalo_e2eqa", + "task": "knowledge-openended", + "prompting_strategy": 0, + "columns": { + "query": "question", + "answer": "answers" + } + }, + "zalo_e2eqa_fairness": { + "file_name": "zalo_e2eqa_fairness", + 
"task": "knowledge-openended", + "prompting_strategy": 0, + "columns": { + "query": "question", + "answer": "answers" + } + }, + "zalo_e2eqa_robustness": { + "file_name": "zalo_e2eqa_robustness", + "task": "knowledge-openended", + "prompting_strategy": 0, + "columns": { + "query": "question", + "answer": "answers" + } + }, + "VSEC": { + "file_name": "VSEC", + "task": "language-modeling", + "prompting_strategy": 1, + "columns": { + "source": "text", + "target": "correct" + } + }, + "synthetic_natural_azr": { + "hf_hub_url": "ura-hcmut/synthetic_reasoning_natural", + "subset": "easy_azr", + "task": "reasoning", + "prompting_strategy": 0, + "columns": { + "query": "problem", + "answer": "target" + } + }, + "synthetic_abstract": { + "hf_hub_url": "ura-hcmut/synthetic_reasoning", + "subset": "easy_azr", + "task": "reasoning", + "prompting_strategy": 0, + "columns": { + "query": "source", + "answer": "target" + } + }, + "math_level1_azr": { + "hf_hub_url": "ura-hcmut/MATH_Level_1", + "subset": "azr", + "task": "math", + "prompting_strategy": 0, + "columns": { + "type_id": "type", + "query": "problem", + "answer": "short_solution" + } + }, + "math_level1_cot_azr": { + "hf_hub_url": "ura-hcmut/MATH_Level_1", + "subset": "azr", + "task": "math", + "prompting_strategy": 0, + "columns": { + "type_id": "type", + "query": "problem", + "answer": "solution" + } + }, + "opus100_envi": { + "hf_hub_url": "vietgpt/opus100_envi", + "task": "translation", + "prompting_strategy": 0, + "columns": { + "source": "en", + "target": "vi" + } + }, + "opus100_vien": { + "hf_hub_url": "vietgpt/opus100_envi", + "task": "translation", + "prompting_strategy": 0, + "columns": { + "source": "vi", + "target": "en" + } + } } \ No newline at end of file diff --git a/src/vieval/tools/metrics/ir.py b/src/vieval/tools/metrics/ir.py index 0cb054e..5e424dc 100644 --- a/src/vieval/tools/metrics/ir.py +++ b/src/vieval/tools/metrics/ir.py @@ -85,14 +85,9 @@ def evaluate(self, data: Dict, args, **kwargs) -> (Dict, Dict): data (Dict): A dictionary containing predictions to be evaluated. 
""" result = {} - if "mmarco" in args.filepath: - refenreces = load_dataset("json", data_files="./mmarco.json", split="train") - else: - refenreces = load_dataset( - "json", data_files="./mrobust.json", split="train" - ) - - predictions = data["prediction"] + + refenreces = kwargs["ref_dataset"] + predictions = data["predictions"] qrels = self._get_qrel(refenreces) diff --git a/src/vieval/tools/metrics/language.py b/src/vieval/tools/metrics/language.py index e676e78..4499a8c 100644 --- a/src/vieval/tools/metrics/language.py +++ b/src/vieval/tools/metrics/language.py @@ -11,7 +11,7 @@ class LanguageMetric(BaseMetric): """Evaluate language generation tasks.""" - def __init__(self) -> None: + def __init__(self, data, args) -> None: self.cer_metrics = evaluate.load("cer") self.wer_metrics = evaluate.load("wer") super().__init__(data, args) diff --git a/src/vieval/tools/pipelines/metric_pipelines.py b/src/vieval/tools/pipelines/metric_pipelines.py index 268bd84..42f0492 100644 --- a/src/vieval/tools/pipelines/metric_pipelines.py +++ b/src/vieval/tools/pipelines/metric_pipelines.py @@ -19,7 +19,7 @@ class MetricPipeline: def __init__(self): - + self.metric_classes = { "question-answering": [QAMetric, BiasMetric, ToxicityMetric], "summarization": [SummaryMetric, BiasMetric, ToxicityMetric], @@ -45,21 +45,17 @@ def _load_metrics(self, data, task_name, answer_key, class_names, args): return obj_lst - def run_mean( - self, data, task_name: str, answer_key: str, class_names: List, args - ) -> Dict: + def run_mean(self, data, task_name: str, answer_key: str, class_names: List, args, **kwargs) -> Dict: metric_lst = self._load_metrics(data, task_name, answer_key, class_names, args) result = {} for metric in metric_lst: - _, metric_result = metric.evaluate(data, args) + _, metric_result = metric.evaluate(data, args, **kwargs) result.update(metric_result) return result - def run_std( - self, data, task_name: str, answer_key: str, class_names: List, args - ) -> Dict: - result_lst = self._run_bootrap(data, task_name, answer_key, class_names, args) + def run_std(self, data, task_name: str, answer_key: str, class_names: List, args, **kwargs) -> Dict: + result_lst = self._run_bootrap(data, task_name, answer_key, class_names, args, **kwargs) final_result = self._get_std(result_lst) return final_result @@ -88,7 +84,7 @@ def _get_subdata(self, data: Dict, n: int, indices) -> Dict: return sub_data - def _run_bootrap(self, data, task_name, answer_key, class_names, args) -> Dict: + def _run_bootrap(self, data, task_name, answer_key, class_names, args, **kwargs) -> Dict: n_data = len( data["predictions"] ) # if 'predictions' in data else len(data['prediction']) @@ -100,7 +96,7 @@ def _run_bootrap(self, data, task_name, answer_key, class_names, args) -> Dict: ) print(n_data, len(indices)) sub_data = self._get_subdata(data, n_data, indices) - result = self.run_mean(sub_data, task_name, answer_key, class_names, args) + result = self.run_mean(sub_data, task_name, answer_key, class_names, args, **kwargs) results_lst.append(result) return results_lst diff --git a/src/vieval/tools/pipelines/pipelines.py b/src/vieval/tools/pipelines/pipelines.py index 77fedef..7badb7d 100644 --- a/src/vieval/tools/pipelines/pipelines.py +++ b/src/vieval/tools/pipelines/pipelines.py @@ -13,21 +13,17 @@ class EvalPipeline: def __init__(self, task, config): # Load generation configuration - with open( - os.path.join(config.config_dir, config.lang, "generation_config.json"), "r" - ) as f: + with open(os.path.join(config.config_dir, config.lang, 
"generation_config.json"), "r") as f: GenerationConfig = json.load(f) - with open(os.path.join(config.config_dir, "llm_template.json"), "r") as f: + with open(os.path.join(config.config_dir, config.lang, "llm_template.json"), "r") as f: LLM_TEMPLATE = json.load(f) - with open( - os.path.join(config.config_dir, config.lang, "metric_configuration.json"), - "r", - ) as f: + with open(os.path.join(config.config_dir, config.lang, "metric_configuration.json"), "r") as f: METRIC_CONFIG = json.load(f) # Load task self.task_name = task + # Load pipelines # print(config.tgi) @@ -82,7 +78,9 @@ def __call__(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): elif "translation" in task: return self.__translation(ds_wrapper, ds_loader, saving_fn, start_idx) elif "language-modeling" in task: - return self.__language_modeling(ds_wrapper, ds_loader, saving_fn, start_idx) + return self.__language_modeling( + ds_wrapper, ds_loader, saving_fn, start_idx + ) elif "text-classification" in task: return self.__multiple_choice_text_classification( ds_wrapper, ds_loader, saving_fn, start_idx @@ -126,16 +124,10 @@ def __question_answering(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): if self.few_shot: def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.context], - rec[ds_wrapper.dataset_info.query], - rec[ds_wrapper.dataset_info.answer]["text"][0], - ] + return [rec[ds_wrapper.dataset_info.context], rec[ds_wrapper.dataset_info.query], rec[ds_wrapper.dataset_info.answer]["text"][0]] selected_sample_idx = list( - random.sample( - range(len(ds_wrapper.dataset_training)), self.config.num_fs - ) + random.sample(range(len(ds_wrapper.dataset_training)), self.config.num_fs) ) selected_sample = [ preprocessing_a_record(ds_wrapper.dataset_training[s]) @@ -164,17 +156,12 @@ def preprocessing_a_record(rec): ), }, ] - for c, q in zip( - batch[ds_wrapper.dataset_info.context], - batch[ds_wrapper.dataset_info.query], - ) + for c, q in zip(batch[ds_wrapper.dataset_info.context], batch[ds_wrapper.dataset_info.query]) ] results, logprobs, _ = self.infer_pipeline(prompts, return_probs=True) predictions.extend(results) - references.extend( - [x[0] for x in batch[ds_wrapper.dataset_info.answer]["text"]] - ) + references.extend([x[0] for x in batch[ds_wrapper.dataset_info.answer]["text"]]) generation_probs.extend(logprobs) idx += 1 @@ -188,11 +175,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -203,18 +186,10 @@ def preprocessing_a_record(rec): "fewshot": selected_sample, } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -238,15 +213,10 @@ def __question_answering_without_context( if self.few_shot: 
def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.query], - rec[ds_wrapper.dataset_info.answer], - ] + return [rec[ds_wrapper.dataset_info.query], rec[ds_wrapper.dataset_info.answer]] selected_sample_idx = list( - random.sample( - range(len(ds_wrapper.dataset_training)), self.config.num_fs - ) + random.sample(range(len(ds_wrapper.dataset_training)), self.config.num_fs) ) selected_sample = [ preprocessing_a_record(ds_wrapper.dataset_training[s]) @@ -321,11 +291,7 @@ def preprocessing_a_record(rec): saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -337,18 +303,10 @@ def preprocessing_a_record(rec): "fewshot": selected_sample, } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -369,15 +327,10 @@ def __summarization(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): if self.few_shot: def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.source], - rec[ds_wrapper.dataset_info.target], - ] + return [rec[ds_wrapper.dataset_info.source], rec[ds_wrapper.dataset_info.target]] selected_sample_idx = list( - random.sample( - range(len(ds_wrapper.dataset_training)), self.config.num_fs - ) + random.sample(range(len(ds_wrapper.dataset_training)), self.config.num_fs) ) selected_sample = [ preprocessing_a_record(ds_wrapper.dataset_training[s]) @@ -407,9 +360,7 @@ def preprocessing_a_record(rec): ] for document in batch[ds_wrapper.dataset_info.source] ] - original_documents.extend( - [x for x in batch[ds_wrapper.dataset_info.source]] - ) + original_documents.extend([x for x in batch[ds_wrapper.dataset_info.source]]) results, logprobs, _ = self.infer_pipeline(prompts, return_probs=True) predictions.extend(results) @@ -428,11 +379,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -444,18 +391,10 @@ def preprocessing_a_record(rec): "fewshot": selected_sample, } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], 
ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -478,16 +417,10 @@ def __multiple_choice_sentiment( generation_probs.extend(self.continue_infer_data["generation_probs"]) option_probs.extend(self.continue_infer_data["option_probs"]) if self.few_shot: - def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.query], - rec[ds_wrapper.dataset_info.answer], - ] + return [rec[ds_wrapper.dataset_info.query], rec[ds_wrapper.dataset_info.answer]] - classes = unique( - ds_wrapper.dataset_training[ds_wrapper.dataset_info.answer] - ) + classes = unique(ds_wrapper.dataset_training[ds_wrapper.dataset_info.answer]) selected_sample = [] for cl in classes: cl_samples = ds_wrapper.dataset_training.filter( @@ -578,11 +511,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -595,18 +524,10 @@ def preprocessing_a_record(rec): } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -626,25 +547,24 @@ def __multiple_choice_text_classification( idx = 0 original_few_shot = [] calib_few_shot = [] - selected_sample = [] + selected_sample = [] num_choice = len(ds_wrapper.dataset_info.label) if self.few_shot: def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.query], - rec[ds_wrapper.dataset_info.answer], - ] + return [rec[ds_wrapper.dataset_info.query], rec[ds_wrapper.dataset_info.answer]] - classes = unique( - ds_wrapper.dataset_training[ds_wrapper.dataset_info.answer] + classes = ( + unique(ds_wrapper.dataset_training[ds_wrapper.dataset_info.answer]) ) selected_sample = [] for cl in classes: cl_samples = ds_wrapper.dataset_training.filter( - lambda r: (r[ds_wrapper.dataset_info.answer] == cl) + lambda r: ( + r[ds_wrapper.dataset_info.answer] == cl + ) ) selected_sample.append( cl_samples[random.randint(0, len(cl_samples) - 1)] @@ -737,11 +657,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -753,18 +669,10 @@ def preprocessing_a_record(rec): "fewshot": selected_sample, } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = 
self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -787,14 +695,9 @@ def __multiple_choice_toxicity(self, ds_wrapper, ds_loader, saving_fn, start_idx if self.few_shot: def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.query], - rec[ds_wrapper.dataset_info.answer], - ] + return [rec[ds_wrapper.dataset_info.query], rec[ds_wrapper.dataset_info.answer]] - classes = unique( - ds_wrapper.dataset_training[ds_wrapper.dataset_info.answer] - ) + classes = unique(ds_wrapper.dataset_training[ds_wrapper.dataset_info.answer]) selected_sample = [] for cl in classes: cl_samples = ds_wrapper.dataset_training.filter( @@ -886,11 +789,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -902,18 +801,10 @@ def preprocessing_a_record(rec): "fewshot": selected_sample, } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -953,14 +844,12 @@ def preprocessing_a_record(rec): return [ rec[ds_wrapper.dataset_info.context], rec[ds_wrapper.dataset_info.query], - format_list_ans( - ast.literal_eval(rec[ds_wrapper.dataset_info.options]) - ), + format_list_ans(ast.literal_eval(rec[ds_wrapper.dataset_info.options])), rec[ds_wrapper.dataset_info.answer], ] selected_sample_idx = list( - random.sample(range(len(ds_wrapper.dataset_training)), 2) + random.sample(range(len(ds_wrapper.dataset_training)), self.config.num_fs) ) selected_sample = [ preprocessing_a_record(ds_wrapper.dataset_training[s]) @@ -986,12 +875,8 @@ def preprocessing_a_record(rec): prompts = [] calib_prompts = [] remap_order_batch = [] - for cq in zip( - batch[ds_wrapper.dataset_info.context], - batch[ds_wrapper.dataset_info.query], - batch[ds_wrapper.dataset_info.options], - ): - + for cq in zip(batch[ds_wrapper.dataset_info.context], batch[ds_wrapper.dataset_info.query], batch[ds_wrapper.dataset_info.options]): + c = cq[0] q = cq[1] opts = ast.literal_eval(cq[2]) @@ -1038,11 +923,7 @@ def preprocessing_a_record(rec): results, logprobs, _ = self.infer_pipeline(prompts, return_probs=True) option_logprobs, _ = self.infer_pipeline.compute_logprob_and_length( calib_prompts * num_choice, - [ - ds_wrapper.dataset_info.label[choice] - for choice in range(num_choice) - for _ in range(len(prompts)) - ], + [ds_wrapper.dataset_info.label[choice] for choice in range(num_choice) for _ in range(len(prompts))], ) opt_calib_out = [ [option_logprobs[i + opt * 
len(prompts)] for opt in range(num_choice)] @@ -1057,12 +938,8 @@ def preprocessing_a_record(rec): # Map the reference to the new order references.extend( [ - ds_wrapper.dataset_info.label[ - remap.index(ds_wrapper.dataset_info.label.index(x)) - ] - for x, remap in zip( - batch[ds_wrapper.dataset_info.answer], remap_order_batch - ) + ds_wrapper.dataset_info.label[remap.index(ds_wrapper.dataset_info.label.index(x))] + for x, remap in zip(batch[ds_wrapper.dataset_info.answer], remap_order_batch) ] ) @@ -1081,11 +958,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -1099,18 +972,10 @@ def preprocessing_a_record(rec): } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -1129,13 +994,14 @@ def __language_modeling(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): if self.few_shot: def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.source], - rec[ds_wrapper.dataset_info.target], - ] - + return [rec[ds_wrapper.dataset_info.source], rec[ds_wrapper.dataset_info.target]] + + selected_sample_idx = list( + random.sample(range(len(ds_wrapper.dataset_training)), self.config.num_fs) + ) selected_sample = [ - preprocessing_a_record(s) for s in ds_wrapper.dataset_training + preprocessing_a_record(ds_wrapper.dataset_training[s]) + for s in selected_sample_idx ] original_few_shot = format_fewshot( selected_sample, @@ -1179,11 +1045,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -1194,18 +1056,10 @@ def preprocessing_a_record(rec): "fewshot": selected_sample, } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -1250,12 +1104,12 @@ def preprocessing_a_record(rec): first_sample = { "passages": random_sample["positive"], "query": random_sample[ds_wrapper.dataset_info.query], - 
"references": "Yes", + "references": ds_wrapper.dataset_info.label[0], } second_sample = { "passages": random_sample["negative"], "query": random_sample[ds_wrapper.dataset_info.query], - "references": "No", + "references": ds_wrapper.dataset_info.label[1], } selected_sample = [ @@ -1278,11 +1132,9 @@ def preprocessing_a_record(rec): if idx < start_idx: idx += 1 continue - for query_with_a_batch_passages in range(len(batch[ds_wrapper.id])): - query_id = batch[ds_wrapper.id][query_with_a_batch_passages] - query = batch[ds_wrapper.dataset_info.query][ - query_with_a_batch_passages - ] + for query_with_a_batch_passages in range(len(batch[ds_wrapper.dataset_info.type_id])): + query_id = batch[ds_wrapper.dataset_info.type_id][query_with_a_batch_passages] + query = batch[ds_wrapper.dataset_info.query][query_with_a_batch_passages] try: ref_passage_id = batch[ds_wrapper.dataset_info.answer][0][ query_with_a_batch_passages @@ -1345,10 +1197,10 @@ def preprocessing_a_record(rec): ) option_logprobs, _ = self.infer_pipeline.compute_logprob_and_length( - calib_prompts * 2, + calib_prompts * len(ds_wrapper.dataset_info.label), [ choice - for choice in ["Yes", "No"] + for choice in ds_wrapper.dataset_info.label for _ in range(len(prompts)) ], ) @@ -1372,7 +1224,7 @@ def preprocessing_a_record(rec): "generation_probs": y, "calib_probs": [ option_logprobs[q + opt * len(prompts)] - for opt in range(2) + for opt in range(len(ds_wrapper.dataset_info.label)) ], }, results, @@ -1391,28 +1243,16 @@ def preprocessing_a_record(rec): generations = {"fewshot": selected_sample, "predictions": predictions} saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config, ref_dataset=ds_wrapper.dataset_testing ) print(f"Results of {idx} batches: ", mean_result) generations = {"fewshot": selected_sample, "predictions": predictions} mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config, ref_dataset=ds_wrapper.dataset_testing ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config, ref_dataset=ds_wrapper.dataset_testing ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) @@ -1426,26 +1266,21 @@ def __reasoning(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): original_few_shot = [] calib_few_shot = [] selected_sample = [] - + if self.continue_infer_data is not None: predictions.extend(self.continue_infer_data["predictions"]) references.extend(self.continue_infer_data["references"]) generation_probs.extend(self.continue_infer_data["generation_probs"]) calib_probs.extend(self.continue_infer_data["calibration_probs"]) - + if self.few_shot: def preprocessing_a_record(rec): - return [ - rec[ds_wrapper.dataset_info.query], - rec[ds_wrapper.dataset_info.answer], - ] + return [rec[ds_wrapper.dataset_info.query], rec[ds_wrapper.dataset_info.answer]] selected_sample = [ preprocessing_a_record(s) - for s in list( - 
random.sample(list(ds_wrapper.dataset_training), self.config.num_fs) - ) + for s in list(random.sample(list(ds_wrapper.dataset_training), self.config.num_fs)) ] original_few_shot = format_fewshot( selected_sample, @@ -1507,14 +1342,10 @@ def preprocessing_a_record(rec): "calibration_probs": calib_probs, "fewshot": selected_sample, } - + saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -1525,25 +1356,17 @@ def preprocessing_a_record(rec): "calibration_probs": calib_probs, "fewshot": selected_sample, } - + mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result) - + def __math(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): predictions = [] references = [] @@ -1554,7 +1377,7 @@ def __math(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): original_few_shot = [] calib_few_shot = [] selected_sample = [] - pattern = regex.compile(r"\\boxed\{(?:[^{}]|(?R))*\}") + pattern = regex.compile(r'\\boxed\{(?:[^{}]|(?R))*\}') # res_list = pattern.findall(text) # return res_list[0] if res_list else None if self.continue_infer_data is not None: @@ -1568,16 +1391,11 @@ def __math(self, ds_wrapper, ds_loader, saving_fn, start_idx=0): if self.few_shot: def preprocessing_a_record(rec): - return [ - rf"{rec[ds_wrapper.dataset_info.query]}", - rf"{rec[ds_wrapper.dataset_info.answer]}", - ] + return [fr"{rec[ds_wrapper.dataset_info.query]}", fr"{rec[ds_wrapper.dataset_info.answer]}"] selected_sample = [ preprocessing_a_record(s) - for s in list( - random.sample(list(ds_wrapper.dataset_training), self.config.num_fs) - ) + for s in list(random.sample(list(ds_wrapper.dataset_training), self.config.num_fs)) ] original_few_shot = format_fewshot( selected_sample, @@ -1589,7 +1407,7 @@ def preprocessing_a_record(rec): query_format=ds_wrapper.calibration_prompt["prompt"], answer_format=ds_wrapper.prompt["answer_format"], ) - + for batch in tqdm(ds_loader): if idx < start_idx: idx += 1 @@ -1600,7 +1418,7 @@ def preprocessing_a_record(rec): *original_few_shot, { "role": "user", - "content": ds_wrapper.prompt["prompt"].format(rf"{rule}"), + "content": ds_wrapper.prompt["prompt"].format(fr"{rule}"), }, ] for rule in batch[ds_wrapper.dataset_info.query] @@ -1614,9 +1432,7 @@ def preprocessing_a_record(rec): *calib_few_shot, { "role": "user", - "content": ds_wrapper.calibration_prompt["prompt"].format( - rf"{rule}" - ), + "content": ds_wrapper.calibration_prompt["prompt"].format(fr"{rule}"), }, ] for rule in batch[ds_wrapper.dataset_info.query] @@ -1630,9 +1446,7 @@ def preprocessing_a_record(rec): references.extend([x for x in batch[ds_wrapper.dataset_info.answer]]) generation_probs.extend(logprobs) calib_probs.extend(calibprob_batch) - math_problem_type.extend( 
- [x for x in batch[ds_wrapper.dataset_info.type_id]] - ) + math_problem_type.extend([x for x in batch[ds_wrapper.dataset_info.type_id]]) idx += 1 if idx % 100 == 0: print(f"Saving results of {idx} batches") @@ -1642,16 +1456,12 @@ def preprocessing_a_record(rec): "generation_probs": generation_probs, "calibration_probs": calib_probs, "fewshot": selected_sample, - "math_problem_type": math_problem_type, + "math_problem_type": math_problem_type } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -1661,22 +1471,14 @@ def preprocessing_a_record(rec): "generation_probs": generation_probs, "calibration_probs": calib_probs, "fewshot": selected_sample, - "math_problem_type": math_problem_type, + "math_problem_type": math_problem_type } - + mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} @@ -1703,9 +1505,7 @@ def preprocessing_a_record(rec): selected_sample = [ preprocessing_a_record(s) - for s in list( - random.sample(list(ds_wrapper.dataset_training), self.config.num_fs) - ) + for s in list(random.sample(list(ds_wrapper.dataset_training), self.config.num_fs)) ] original_few_shot = format_fewshot( selected_sample, @@ -1749,11 +1549,7 @@ def preprocessing_a_record(rec): } saving_fn(generations) mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) print(f"Results of {idx} batches: ", mean_result) @@ -1764,18 +1560,10 @@ def preprocessing_a_record(rec): "fewshot": selected_sample, } mean_result = self.metric_pipeline.run_mean( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) std_result = self.metric_pipeline.run_std( - generations, - self.task_name, - ds_wrapper.prompt["answer_key"], - ds_wrapper.dataset_info.label, - self.config, + generations, self.task_name, ds_wrapper.prompt["answer_key"], ds_wrapper.dataset_info.label, self.config ) final_result = {"mean": mean_result, "std": std_result} saving_fn(generations, final_result)
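
Note for reviewers: a minimal, self-contained sketch of the calling convention this patch introduces for the information-retrieval metrics. After the change, MetricPipeline.run_mean/run_std accept **kwargs and forward them to each metric's evaluate(), and the IR metric reads kwargs["ref_dataset"] and data["predictions"] instead of reloading ./mmarco.json or ./mrobust.json from disk; the IR pipeline supplies ref_dataset=ds_wrapper.dataset_testing. The names below (FakeIRMetric, the standalone run_mean, and the toy inputs) are simplified stand-ins written only to illustrate the kwargs plumbing; they are not the real vieval classes.

from typing import Dict, List

class FakeIRMetric:
    # Stand-in for InformationRetrievalMetric; only the evaluate() shape mirrors the patch.
    def __init__(self, data, args):
        pass

    def evaluate(self, data: Dict, args, **kwargs) -> (Dict, Dict):
        refs = kwargs["ref_dataset"]      # references now arrive via the ref_dataset keyword
        preds = data["predictions"]       # key renamed from "prediction" to "predictions"
        return data, {"num_refs": len(refs), "num_preds": len(preds)}

def run_mean(data, task_name: str, answer_key: str, class_names: List, args, **kwargs) -> Dict:
    # Same shape as MetricPipeline.run_mean after this patch: extra kwargs are
    # forwarded unchanged to every metric's evaluate().
    result = {}
    for metric in [FakeIRMetric(data, args)]:
        _, metric_result = metric.evaluate(data, args, **kwargs)
        result.update(metric_result)
    return result

# Toy usage with hypothetical data, mirroring the __information_retrieval call site:
print(run_mean({"predictions": [{"query_id": 0}]}, "information-retrieval",
               "answer", ["Yes", "No"], None, ref_dataset=[{"id": 0}]))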