From f1c9e292d78f2b1b87aa4e90d05d9c4bd28d1e58 Mon Sep 17 00:00:00 2001
From: Zening Lin
Date: Mon, 31 Oct 2022 09:51:30 +0800
Subject: [PATCH 1/2] make path to the xfund&funsd flexible

---
 LiLTfinetune/data/data_args.py     |  1 +
 LiLTfinetune/data/datasets/xfun.py | 24 ++++++++----------------
 README.md                          | 14 +++++++++-----
 examples/run_xfun_re.py            |  1 +
 examples/run_xfun_ser.py           |  1 +
 5 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/LiLTfinetune/data/data_args.py b/LiLTfinetune/data/data_args.py
index f618e02..0e69acc 100644
--- a/LiLTfinetune/data/data_args.py
+++ b/LiLTfinetune/data/data_args.py
@@ -73,6 +73,7 @@ class DataTrainingArguments:
         default=False,
         metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
     )
+    data_dir: str = field(default=None, metadata={"help": "dir to dataset"})
 
 
 @dataclass
diff --git a/LiLTfinetune/data/datasets/xfun.py b/LiLTfinetune/data/datasets/xfun.py
index e2ac881..93f4bd8 100644
--- a/LiLTfinetune/data/datasets/xfun.py
+++ b/LiLTfinetune/data/datasets/xfun.py
@@ -71,18 +71,11 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        # urls_to_download = {
-        #     "train": [f"{_URL}{self.config.lang}.train.json", f"{_URL}{self.config.lang}.train.zip"],
-        #     "val": [f"{_URL}{self.config.lang}.val.json", f"{_URL}{self.config.lang}.val.zip"],
-        #     # "test": [f"{_URL}{self.config.lang}.test.json", f"{_URL}{self.config.lang}.test.zip"],
-        # }
-        # downloaded_files = dl_manager.download_and_extract(urls_to_download)
-        # train_files_for_many_langs = [downloaded_files["train"]]
-        # val_files_for_many_langs = [downloaded_files["val"]]
-        # # test_files_for_many_langs = [downloaded_files["test"]]
-        file_dir = 'xfund&funsd/'
-        train_files_for_many_langs = [[file_dir+f"{self.config.lang}.train.json", file_dir+f"{self.config.lang}"]]
-        val_files_for_many_langs = [[file_dir+f"{self.config.lang}.val.json", file_dir+f"{self.config.lang}"]]
+
+        file_dir = self.config.data_dir
+
+        train_files_for_many_langs = [[file_dir + f"{self.config.lang}.train.json", file_dir + f"{self.config.lang}"]]
+        val_files_for_many_langs = [[file_dir + f"{self.config.lang}.val.json", file_dir + f"{self.config.lang}"]]
 
         if self.config.additional_langs:
             additional_langs = self.config.additional_langs.split("+")
@@ -92,8 +85,7 @@ def _split_generators(self, dl_manager):
                 # urls_to_download = {"train": [f"{_URL}{lang}.train.json", f"{_URL}{lang}.train.zip"]}
                 # additional_downloaded_files = dl_manager.download_and_extract(urls_to_download)
                 # train_files_for_many_langs.append(additional_downloaded_files["train"])
-                train_files_for_many_langs.append([file_dir+f"{lang}.train.json", file_dir+f"{lang}"])
-
+                train_files_for_many_langs.append([file_dir + f"{lang}.train.json", file_dir + f"{lang}"])
         logger.info(f"Training on {self.config.lang} with additional langs({self.config.additional_langs})")
         logger.info(f"Evaluating on {self.config.lang}")
 
@@ -128,9 +120,9 @@ def _generate_examples(self, filepaths):
                     continue
                 id2label[line["id"]] = line["label"]
                 relations.extend([tuple(sorted(l)) for l in line["linking"]])
-                if '/en' in filepath[0]:
+                if "/en" in filepath[0]:
                     tokenized_inputs = self.tokenizer(
-                        ' '.join([q['text'].replace(u'\uf703','') for q in line['words']]),
+                        " ".join([q["text"].replace("\uf703", "") for q in line["words"]]),
                         add_special_tokens=False,
                         return_offsets_mapping=True,
                         return_attention_mask=False,
diff --git a/README.md b/README.md
index ae43c36..57786d4 100644
--- a/README.md
+++ b/README.md
@@ -29,11 +29,6 @@ pip install -e .
 
 Or check [Detectron2](https://github.com/facebookresearch/detectron2/releases)/[PyTorch](https://pytorch.org/get-started/previous-versions/) versions and modify the command lines accordingly.
 
-## Datasets
-
-In this repository, we provide the fine-tuning codes for [FUNSD](https://guillaumejaume.github.io/FUNSD/) and [XFUND](https://github.com/doc-analysis/XFUND).
-
-You can download our **pre-processed data (~1.2GB)** from [**HERE**](https://1drv.ms/u/s!Ahd-h7H5akVZeZQvKieg8g5THV8?e=mBRnxw), and put the unzipped `xfund&funsd/` under `LiLT/`.
 
 ## Available Checkpoints
 
@@ -76,6 +71,11 @@ python gen_weight_roberta_like.py \
 
 ## Fine-tuning
 
+In this repository, we provide the fine-tuning codes for [FUNSD](https://guillaumejaume.github.io/FUNSD/) and [XFUND](https://github.com/doc-analysis/XFUND).
+
+When fine-tuning on `FUNSD`, the dataset will be automatically downloaded from [here](https://guillaumejaume.github.io/FUNSD/dataset.zip).
+
+When fine-tuning on `XFUND`, the **pre-processed data (~1.2GB)** should be downloaded from [**HERE**](https://1drv.ms/u/s!Ahd-h7H5akVZeZQvKieg8g5THV8?e=mBRnxw) in advance. The path to the unzipped data can be specified through the *--data_dir* argument.
 
 ### Semantic Entity Recognition on FUNSD
 
@@ -99,6 +99,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir ls_ser_xfund_zh_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --do_eval \
         --lang zh \
@@ -115,6 +116,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir ls_re_xfund_zh_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --do_eval \
         --lang zh \
@@ -132,6 +134,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir mt_ser_xfund_all_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --additional_langs all \
         --max_steps 16000 \
@@ -147,6 +150,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
         --model_name_or_path lilt-infoxlm-base \
         --tokenizer_name xlm-roberta-base \
         --output_dir mt_re_xfund_all_lilt-infoxlm-base \
+        --data_dir {dir_to_preprocessed_xfund_data} \
         --do_train \
         --additional_langs all \
         --max_steps 40000 \
diff --git a/examples/run_xfun_re.py b/examples/run_xfun_re.py
index e40d60b..84c0f89 100644
--- a/examples/run_xfun_re.py
+++ b/examples/run_xfun_re.py
@@ -84,6 +84,7 @@ def main():
     datasets = load_dataset(
         os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
         f"xfun.{data_args.lang}",
+        data_dir=data_args.data_dir,
         additional_langs=data_args.additional_langs,
         keep_in_memory=True,
     )
diff --git a/examples/run_xfun_ser.py b/examples/run_xfun_ser.py
index 9fc7593..6fd5880 100644
--- a/examples/run_xfun_ser.py
+++ b/examples/run_xfun_ser.py
@@ -86,6 +86,7 @@ def main():
     datasets = load_dataset(
         os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
         f"xfun.{data_args.lang}",
+        data_dir=data_args.data_dir,
         additional_langs=data_args.additional_langs,
         keep_in_memory=True,
     )

From 72f4027fe9042679000dd67f0bf222801b65093c Mon Sep 17 00:00:00 2001
From: Zening Lin
Date: Mon, 31 Oct 2022 09:52:01 +0800
Subject: [PATCH 2/2] fix data args error when training funsd

---
 examples/run_funsd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_funsd.py b/examples/run_funsd.py
index d49e045..4edb8f5 100644
--- a/examples/run_funsd.py
+++ b/examples/run_funsd.py
@@ -302,7 +302,7 @@ def compute_metrics(p):
         model=model,
         args=training_args,
         train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        eval_dataset=test_dataset if training_args.do_predict else None,
         tokenizer=tokenizer,
         data_collator=data_collator,
         compute_metrics=compute_metrics,
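
Usage sketch (not part of the patch): a minimal illustration of how the new `data_dir` value reaches the XFUN builder, mirroring the `load_dataset(...)` calls changed in `examples/run_xfun_ser.py` and `examples/run_xfun_re.py`. The concrete path and the `zh` language code below are placeholders; since `_split_generators` builds file names by plain string concatenation (`file_dir + f"{lang}.train.json"`), the directory passed via `--data_dir` is assumed to end with a trailing `/`.

```python
# Sketch only -- illustrates the data flow enabled by this patch; paths are placeholders.
import os

from datasets import load_dataset

import LiLTfinetune.data.datasets.xfun

# `load_dataset` forwards `data_dir` into the builder config, where
# `_split_generators` reads it as `self.config.data_dir` and concatenates
# "{lang}.train.json" / "{lang}.val.json" onto it.
datasets = load_dataset(
    os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
    "xfun.zh",                      # same as f"xfun.{data_args.lang}" with lang="zh"
    data_dir="xfund&funsd/",        # placeholder path; note the trailing "/"
    additional_langs=None,
    keep_in_memory=True,
)
print(datasets)
```

With the patch applied, the same call works for the pre-processed data at any location instead of the previously hard-coded `xfund&funsd/` directory.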