From f1c9e292d78f2b1b87aa4e90d05d9c4bd28d1e58 Mon Sep 17 00:00:00 2001
From: Zening Lin
Date: Mon, 31 Oct 2022 09:51:30 +0800
Subject: [PATCH 1/2] make path to the xfund&funsd flexible

---
 LiLTfinetune/data/data_args.py     |  1 +
 LiLTfinetune/data/datasets/xfun.py | 24 ++++++++----------------
 README.md                          | 14 +++++++++-----
 examples/run_xfun_re.py            |  1 +
 examples/run_xfun_ser.py           |  1 +
 5 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/LiLTfinetune/data/data_args.py b/LiLTfinetune/data/data_args.py
index f618e02..0e69acc 100644
--- a/LiLTfinetune/data/data_args.py
+++ b/LiLTfinetune/data/data_args.py
@@ -73,6 +73,7 @@ class DataTrainingArguments:
         default=False,
         metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
     )
+    data_dir: str = field(default=None, metadata={"help": "dir to dataset"})
 
 
 @dataclass
diff --git a/LiLTfinetune/data/datasets/xfun.py b/LiLTfinetune/data/datasets/xfun.py
index e2ac881..93f4bd8 100644
--- a/LiLTfinetune/data/datasets/xfun.py
+++ b/LiLTfinetune/data/datasets/xfun.py
@@ -71,18 +71,11 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        # urls_to_download = {
-        #     "train": [f"{_URL}{self.config.lang}.train.json", f"{_URL}{self.config.lang}.train.zip"],
-        #     "val": [f"{_URL}{self.config.lang}.val.json", f"{_URL}{self.config.lang}.val.zip"],
-        #     # "test": [f"{_URL}{self.config.lang}.test.json", f"{_URL}{self.config.lang}.test.zip"],
-        # }
-        # downloaded_files = dl_manager.download_and_extract(urls_to_download)
-        # train_files_for_many_langs = [downloaded_files["train"]]
-        # val_files_for_many_langs = [downloaded_files["val"]]
-        # # test_files_for_many_langs = [downloaded_files["test"]]
-        file_dir = 'xfund&funsd/'
-        train_files_for_many_langs = [[file_dir+f"{self.config.lang}.train.json", file_dir+f"{self.config.lang}"]]
-        val_files_for_many_langs = [[file_dir+f"{self.config.lang}.val.json", file_dir+f"{self.config.lang}"]]
+
+        file_dir = self.config.data_dir
+
+        train_files_for_many_langs = [[file_dir + f"{self.config.lang}.train.json", file_dir + f"{self.config.lang}"]]
+        val_files_for_many_langs = [[file_dir + f"{self.config.lang}.val.json", file_dir + f"{self.config.lang}"]]
 
         if self.config.additional_langs:
             additional_langs = self.config.additional_langs.split("+")
@@ -92,8 +85,7 @@ def _split_generators(self, dl_manager):
                 # urls_to_download = {"train": [f"{_URL}{lang}.train.json", f"{_URL}{lang}.train.zip"]}
                 # additional_downloaded_files = dl_manager.download_and_extract(urls_to_download)
                 # train_files_for_many_langs.append(additional_downloaded_files["train"])
-                train_files_for_many_langs.append([file_dir+f"{lang}.train.json", file_dir+f"{lang}"])
-
+                train_files_for_many_langs.append([file_dir + f"{lang}.train.json", file_dir + f"{lang}"])
         logger.info(f"Training on {self.config.lang} with additional langs({self.config.additional_langs})")
         logger.info(f"Evaluating on {self.config.lang}")
 
@@ -128,9 +120,9 @@ def _generate_examples(self, filepaths):
                     continue
                 id2label[line["id"]] = line["label"]
                 relations.extend([tuple(sorted(l)) for l in line["linking"]])
-                if '/en' in filepath[0]:
+                if "/en" in filepath[0]:
                     tokenized_inputs = self.tokenizer(
-                        ' '.join([q['text'].replace(u'\uf703','') for q in line['words']]),
+                        " ".join([q["text"].replace("\uf703", "") for q in line["words"]]),
                         add_special_tokens=False,
                         return_offsets_mapping=True,
                         return_attention_mask=False,
diff --git a/README.md b/README.md
index ae43c36..57786d4 100644
--- a/README.md
+++ b/README.md
@@ -29,11 +29,6 @@ pip install -e .
 
 Or check [Detectron2](https://github.com/facebookresearch/detectron2/releases)/[PyTorch](https://pytorch.org/get-started/previous-versions/) versions and modify the command lines accordingly.
 
-## Datasets
-
-In this repository, we provide the fine-tuning codes for [FUNSD](https://guillaumejaume.github.io/FUNSD/) and [XFUND](https://github.com/doc-analysis/XFUND).
-
-You can download our **pre-processed data (~1.2GB)** from [**HERE**](https://1drv.ms/u/s!Ahd-h7H5akVZeZQvKieg8g5THV8?e=mBRnxw), and put the unzipped `xfund&funsd/` under `LiLT/`.
 
 ## Available Checkpoints
 
@@ -76,6 +71,11 @@ python gen_weight_roberta_like.py \
 
 ## Fine-tuning
 
+In this repository, we provide the fine-tuning codes for [FUNSD](https://guillaumejaume.github.io/FUNSD/) and [XFUND](https://github.com/doc-analysis/XFUND).
+
+When fine-tuning on `FUNSD`, the dataset will be automatically downloaded from [here](https://guillaumejaume.github.io/FUNSD/dataset.zip).
+
+When fine-tuning on `XFUND`, the **pre-processed data (~1.2GB)** should be downloaded from [**HERE**](https://1drv.ms/u/s!Ahd-h7H5akVZeZQvKieg8g5THV8?e=mBRnxw) in advance. The path to the unzipped data can be specified through the *--data_dir* argument.
 
 ### Semantic Entity Recognition on FUNSD
 
@@ -99,6 +99,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir ls_ser_xfund_zh_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --do_eval \
         --lang zh \
@@ -115,6 +116,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir ls_re_xfund_zh_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --do_eval \
         --lang zh \
@@ -132,6 +134,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir mt_ser_xfund_all_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --additional_langs all \
         --max_steps 16000 \
@@ -147,6 +150,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir mt_re_xfund_all_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --additional_langs all \
         --max_steps 40000 \
diff --git a/examples/run_xfun_re.py b/examples/run_xfun_re.py
index e40d60b..84c0f89 100644
--- a/examples/run_xfun_re.py
+++ b/examples/run_xfun_re.py
@@ -84,6 +84,7 @@ def main():
     datasets = load_dataset(
         os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
         f"xfun.{data_args.lang}",
+        data_dir=data_args.data_dir,
         additional_langs=data_args.additional_langs,
         keep_in_memory=True,
     )
diff --git a/examples/run_xfun_ser.py b/examples/run_xfun_ser.py
index 9fc7593..6fd5880 100644
--- a/examples/run_xfun_ser.py
+++ b/examples/run_xfun_ser.py
@@ -86,6 +86,7 @@ def main():
     datasets = load_dataset(
         os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
         f"xfun.{data_args.lang}",
+        data_dir=data_args.data_dir,
         additional_langs=data_args.additional_langs,
         keep_in_memory=True,
     )

From 72f4027fe9042679000dd67f0bf222801b65093c Mon Sep 17 00:00:00 2001
From: Zening Lin
Date: Mon, 31 Oct 2022 09:52:01 +0800
Subject: [PATCH 2/2] fix data args error when training funsd

---
 examples/run_funsd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_funsd.py b/examples/run_funsd.py
index d49e045..4edb8f5 100644
--- a/examples/run_funsd.py
+++ b/examples/run_funsd.py
@@ -302,7 +302,7 @@ def compute_metrics(p):
         model=model,
         args=training_args,
         train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        eval_dataset=test_dataset if training_args.do_predict else None,
         tokenizer=tokenizer,
         data_collator=data_collator,
         compute_metrics=compute_metrics,
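
Usage sketch (not part of the patch): a minimal illustration of how the new `data_dir` value reaches the XFUN builder, mirroring the `load_dataset(...)` calls changed in `examples/run_xfun_ser.py` and `examples/run_xfun_re.py`. The concrete path and the `zh` language code below are placeholders; since `_split_generators` builds file names by plain string concatenation (`file_dir + f"{lang}.train.json"`), the directory passed via `--data_dir` is assumed to end with a trailing `/`.

```python
# Sketch only -- illustrates the data flow enabled by this patch; paths are placeholders.
import os

from datasets import load_dataset

import LiLTfinetune.data.datasets.xfun

# `load_dataset` forwards `data_dir` into the builder config, where
# `_split_generators` reads it as `self.config.data_dir` and concatenates
# "{lang}.train.json" / "{lang}.val.json" onto it.
datasets = load_dataset(
    os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
    "xfun.zh",                      # same as f"xfun.{data_args.lang}" with lang="zh"
    data_dir="xfund&funsd/",        # placeholder path; note the trailing "/"
    additional_langs=None,
    keep_in_memory=True,
)
print(datasets)
```

With the patch applied, the same call works for the pre-processed data at any location instead of the previously hard-coded `xfund&funsd/` directory.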