From 5624d1fb05a1968399ede25ac1c4577cc9eab5e4 Mon Sep 17 00:00:00 2001
From: Jett
Date: Wed, 22 May 2024 12:37:08 +0200
Subject: [PATCH] use util in tokenize_dataset

---
 scripts/tokenize_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py
index a74df312..c0c6d13f 100755
--- a/scripts/tokenize_dataset.py
+++ b/scripts/tokenize_dataset.py
@@ -8,6 +8,7 @@
 from huggingface_hub import HfApi
 from transformers import AutoTokenizer
 
+from delphi import utils
 from delphi.dataset.tokenization import get_tokenized_chunks
 
 if __name__ == "__main__":
@@ -107,7 +108,7 @@
     )
 
     print(f"Tokenizing split='{args.split}'...")
-    split_name = args.split.split("[")[0]
+    split_name = utils.hf_split_to_split_name(args.split)
     for chunk_idx, ds_chunk in enumerate(ds_chunks_it):
         chunk_name = f"{split_name}-{chunk_idx:05}.parquet"
         if args.out_dir: