Optimize and Bug-Fix #26

Open · wants to merge 2 commits into base: main

1 change: 1 addition & 0 deletions LiLTfinetune/data/data_args.py
@@ -73,6 +73,7 @@ class DataTrainingArguments:
default=False,
metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
)
data_dir: str = field(default=None, metadata={"help": "dir to dataset"})


@dataclass
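For context, a minimal sketch (not part of this diff; the parser wiring is assumed) of how the new `data_dir` field typically becomes a `--data_dir` command-line flag: the example scripts parse `DataTrainingArguments` with Hugging Face's `HfArgumentParser`, which turns each dataclass field into a CLI argument.

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser


# Illustration only: mirrors the field added in data_args.py above.
@dataclass
class DataTrainingArguments:
    data_dir: str = field(default=None, metadata={"help": "dir to dataset"})


parser = HfArgumentParser(DataTrainingArguments)
# e.g. invoked as `python examples/run_xfun_ser.py --data_dir 'xfund&funsd/' ...`
(data_args,) = parser.parse_args_into_dataclasses(args=["--data_dir", "xfund&funsd/"])
print(data_args.data_dir)  # -> xfund&funsd/
```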
24 changes: 8 additions & 16 deletions LiLTfinetune/data/datasets/xfun.py
@@ -71,18 +71,11 @@ def _info(self):

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# urls_to_download = {
# "train": [f"{_URL}{self.config.lang}.train.json", f"{_URL}{self.config.lang}.train.zip"],
# "val": [f"{_URL}{self.config.lang}.val.json", f"{_URL}{self.config.lang}.val.zip"],
# # "test": [f"{_URL}{self.config.lang}.test.json", f"{_URL}{self.config.lang}.test.zip"],
# }
# downloaded_files = dl_manager.download_and_extract(urls_to_download)
# train_files_for_many_langs = [downloaded_files["train"]]
# val_files_for_many_langs = [downloaded_files["val"]]
# # test_files_for_many_langs = [downloaded_files["test"]]
file_dir = 'xfund&funsd/'
train_files_for_many_langs = [[file_dir+f"{self.config.lang}.train.json", file_dir+f"{self.config.lang}"]]
val_files_for_many_langs = [[file_dir+f"{self.config.lang}.val.json", file_dir+f"{self.config.lang}"]]

file_dir = self.config.data_dir

train_files_for_many_langs = [[file_dir + f"{self.config.lang}.train.json", file_dir + f"{self.config.lang}"]]
val_files_for_many_langs = [[file_dir + f"{self.config.lang}.val.json", file_dir + f"{self.config.lang}"]]

if self.config.additional_langs:
additional_langs = self.config.additional_langs.split("+")
@@ -92,8 +85,7 @@ def _split_generators(self, dl_manager):
# urls_to_download = {"train": [f"{_URL}{lang}.train.json", f"{_URL}{lang}.train.zip"]}
# additional_downloaded_files = dl_manager.download_and_extract(urls_to_download)
# train_files_for_many_langs.append(additional_downloaded_files["train"])
train_files_for_many_langs.append([file_dir+f"{lang}.train.json", file_dir+f"{lang}"])

train_files_for_many_langs.append([file_dir + f"{lang}.train.json", file_dir + f"{lang}"])

logger.info(f"Training on {self.config.lang} with additional langs({self.config.additional_langs})")
logger.info(f"Evaluating on {self.config.lang}")
@@ -128,9 +120,9 @@ def _generate_examples(self, filepaths):
continue
id2label[line["id"]] = line["label"]
relations.extend([tuple(sorted(l)) for l in line["linking"]])
if '/en' in filepath[0]:
if "/en" in filepath[0]:
tokenized_inputs = self.tokenizer(
' '.join([q['text'].replace(u'\uf703','') for q in line['words']]),
" ".join([q["text"].replace("\uf703", "") for q in line["words"]]),
add_special_tokens=False,
return_offsets_mapping=True,
return_attention_mask=False,
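One practical note on the path handling above (an observation, not part of the diff): `_split_generators` builds file paths by plain string concatenation of `self.config.data_dir` with the per-language filenames, so the supplied directory is expected to end with a path separator.

```python
# Illustration of the path construction used above (example values only).
data_dir = "xfund&funsd/"  # trailing slash matters: paths are built by concatenation
lang = "zh"

train_files = [[data_dir + f"{lang}.train.json", data_dir + f"{lang}"]]
val_files = [[data_dir + f"{lang}.val.json", data_dir + f"{lang}"]]
print(train_files)  # [['xfund&funsd/zh.train.json', 'xfund&funsd/zh']]
# An os.path.join-based variant would remove the trailing-slash requirement.
```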
14 changes: 9 additions & 5 deletions README.md
@@ -29,11 +29,6 @@ pip install -e .

Or check [Detectron2](https://github.com/facebookresearch/detectron2/releases)/[PyTorch](https://pytorch.org/get-started/previous-versions/) versions and modify the command lines accordingly.

## Datasets

In this repository, we provide the fine-tuning codes for [FUNSD](https://guillaumejaume.github.io/FUNSD/) and [XFUND](https://github.com/doc-analysis/XFUND).

You can download our **pre-processed data (~1.2GB)** from [**HERE**](https://1drv.ms/u/s!Ahd-h7H5akVZeZQvKieg8g5THV8?e=mBRnxw), and put the unzipped `xfund&funsd/` under `LiLT/`.

## Available Checkpoints

@@ -76,6 +71,11 @@ python gen_weight_roberta_like.py \

## Fine-tuning

In this repository, we provide the fine-tuning code for [FUNSD](https://guillaumejaume.github.io/FUNSD/) and [XFUND](https://github.com/doc-analysis/XFUND).

When fine-tuning on `FUNSD`, the dataset will be automatically downloaded from [here](https://guillaumejaume.github.io/FUNSD/dataset.zip).

When fine-tuning on `XFUND`, the **pre-processed data (~1.2GB)** should be downloaded from [**HERE**](https://1drv.ms/u/s!Ahd-h7H5akVZeZQvKieg8g5THV8?e=mBRnxw) in advance. The path to the unzipped data can be specified through the `--data_dir` argument.

### Semantic Entity Recognition on FUNSD

@@ -99,6 +99,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
--model_name_or_path lilt-infoxlm-base \
--tokenizer_name xlm-roberta-base \
--output_dir ls_ser_xfund_zh_lilt-infoxlm-base \
--data_dir {dir_to_preprocessed_xfund_data} \
--do_train \
--do_eval \
--lang zh \
@@ -115,6 +116,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
--model_name_or_path lilt-infoxlm-base \
--tokenizer_name xlm-roberta-base \
--output_dir ls_re_xfund_zh_lilt-infoxlm-base \
--data_dir {dir_to_preprocessed_xfund_data} \
--do_train \
--do_eval \
--lang zh \
@@ -132,6 +134,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
--model_name_or_path lilt-infoxlm-base \
--tokenizer_name xlm-roberta-base \
--output_dir mt_ser_xfund_all_lilt-infoxlm-base \
--data_dir {dir_to_preprocessed_xfund_data} \
--do_train \
--additional_langs all \
--max_steps 16000 \
@@ -147,6 +150,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node
--model_name_or_path lilt-infoxlm-base \
--tokenizer_name xlm-roberta-base \
--output_dir mt_re_xfund_all_lilt-infoxlm-base \
--data_dir {dir_to_preprocessed_xfund_data} \
--do_train \
--additional_langs all \
--max_steps 40000 \
2 changes: 1 addition & 1 deletion examples/run_funsd.py
@@ -302,7 +302,7 @@ def compute_metrics(p):
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
eval_dataset=test_dataset if training_args.do_predict else None,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
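For background on why this one-line change matters, a brief sketch of general `transformers` behavior (assumed here, not taken from this PR): `Trainer.evaluate()` falls back to the `eval_dataset` supplied at construction time, so with the line above the reported metrics come from the test split, and only when `--do_predict` is set.

```python
# Illustration only: standard Trainer usage, not code from this repository.
metrics = trainer.evaluate()          # scores the eval_dataset passed to Trainer(...)
trainer.log_metrics("eval", metrics)  # built-in helpers for printing and saving
trainer.save_metrics("eval", metrics)
```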
1 change: 1 addition & 0 deletions examples/run_xfun_re.py
@@ -84,6 +84,7 @@ def main():
datasets = load_dataset(
os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
f"xfun.{data_args.lang}",
data_dir=data_args.data_dir,
additional_langs=data_args.additional_langs,
keep_in_memory=True,
)
1 change: 1 addition & 0 deletions examples/run_xfun_ser.py
@@ -86,6 +86,7 @@ def main():
datasets = load_dataset(
os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
f"xfun.{data_args.lang}",
data_dir=data_args.data_dir,
additional_langs=data_args.additional_langs,
keep_in_memory=True,
)
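Finally, a sketch of how the new `data_dir` keyword reaches the dataset script (assumed Hugging Face Datasets behavior, not part of this diff): `load_dataset` forwards `data_dir` onto the builder's `BuilderConfig`, so `self.config.data_dir` inside `xfun.py`'s `_split_generators` resolves to the path given on the command line.

```python
# Illustration only, mirroring the calls in run_xfun_ser.py / run_xfun_re.py above.
import os

from datasets import load_dataset

import LiLTfinetune.data.datasets.xfun

datasets = load_dataset(
    os.path.abspath(LiLTfinetune.data.datasets.xfun.__file__),
    "xfun.zh",                # language config, as in the scripts above
    data_dir="xfund&funsd/",  # example path to the unzipped pre-processed data
    keep_in_memory=True,
)
print(datasets)  # DatasetDict with the train/validation splits built from data_dir
```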