From 51a8e57f04da25898a7caa6489c418d96d63958b Mon Sep 17 00:00:00 2001
From: Jett
Date: Fri, 19 Apr 2024 11:41:46 -0700
Subject: [PATCH] tokenizer training script (#103)

* draft for tokenizer training script

* Update scripts/train_tokenizer.py

Co-authored-by: Jett

* integrate some of the suggested changes (WIP)

* another use of tempfile (untested)

* bug fixes and test script

* split the code, reduced num. tmp files, ...

* removed tests/scripts

* updated transformers to 4.40.0

---------

Co-authored-by: Jannik Brinkmann
Co-authored-by: Jannik Brinkmann <62884101+jannik-brinkmann@users.noreply.github.com>
---
 pyproject.toml                           |  2 +-
 scripts/train_tokenizer.py               | 97 +++++++++++++++++++
 src/delphi/train/tokenizer.py            | 81 ++++++++++++++++
 .../functional_test_generate_logprobs.sh | 22 -----
 4 files changed, 179 insertions(+), 23 deletions(-)
 create mode 100755 scripts/train_tokenizer.py
 create mode 100644 src/delphi/train/tokenizer.py
 delete mode 100644 tests/scripts/functional_test_generate_logprobs.sh

diff --git a/pyproject.toml b/pyproject.toml
index 5ef5795b..c7742381 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ dependencies = [
     "dacite==1.8.1",
     "panel==1.4.0",
     "jupyter_bokeh==4.0.1",
-    "transformers==4.39.2",
+    "transformers==4.40.0",
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/train_tokenizer.py b/scripts/train_tokenizer.py
new file mode 100755
index 00000000..f4585671
--- /dev/null
+++ b/scripts/train_tokenizer.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+import argparse
+import tempfile
+
+from delphi.train.tokenizer import (
+    hf_bpe_tokenizer_to_llama_tokenizer,
+    hf_dataset_to_text,
+    sp_processor_to_hf_bpe_tokenizer,
+    train_sentence_piece,
+)
+
+
+def main(
+    *,
+    vocab_size: int,
+    dataset_name: str,
+    split: str,
+    column: str,
+    repo_id: str,
+    hf_token: str,
+):
+    """Trains a SentencePiece tokenizer, converts it to LlamaTokenizerFast and pushes it to the Hugging Face Hub."""
+    with tempfile.TemporaryFile(mode="w+") as text_file:
+        print("Loading and writing dataset to text file...")
+        hf_dataset_to_text(
+            dataset_name=dataset_name,
+            split=split,
+            column=column,
+            text_file=text_file,
+        )
+        text_file.seek(0)
+        print("Training SentencePiece tokenizer...\n")
+        sp_processor = train_sentence_piece(
+            vocab_size=vocab_size,
+            sentence_iterator=text_file,
+        )
+    print("\nConverting SentencePiece tokenizer to Llama tokenizer...")
+    hf_bpe_tokenizer = sp_processor_to_hf_bpe_tokenizer(sp_processor)
+    llama_tokenizer = hf_bpe_tokenizer_to_llama_tokenizer(hf_bpe_tokenizer)
+    print("Pushing Llama tokenizer to Hugging Face Hub...")
+    llama_tokenizer.push_to_hub(
+        repo_id=repo_id,
+        token=hf_token,
+    )
+    print("Done.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Train a SentencePiece tokenizer and convert to HF"
+    )
+    parser.add_argument(
+        "--vocab-size",
+        "-v",
+        type=int,
+        help="Vocabulary size of the tokenizer",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        "-d",
+        type=str,
+        help="Dataset name with or without delphi-suite/ prefix",
+    )
+    parser.add_argument(
+        "--split",
+        "-s",
+        type=str,
+        default="train",
+        help="Split of the dataset to be used for training, supports slicing like 'train[:10%%]'",
+    )
+    parser.add_argument(
+        "--column",
+        "-c",
+        type=str,
+        help="Column of the dataset to be used for training",
+    )
+    parser.add_argument(
+        "--repo-id",
+        "-r",
+        type=str,
+        help="Hugging Face repository ID",
+    )
+    parser.add_argument(
+        "--hf-token",
+        "-t",
+        type=str,
+        help="Hugging Face API token",
+    )
+    args = parser.parse_args()
+    main(
+        vocab_size=args.vocab_size,
+        dataset_name=args.dataset_name,
+        split=args.split,
+        column=args.column,
+        repo_id=args.repo_id,
+        hf_token=args.hf_token,
+    )
diff --git a/src/delphi/train/tokenizer.py b/src/delphi/train/tokenizer.py
new file mode 100644
index 00000000..190ccdee
--- /dev/null
+++ b/src/delphi/train/tokenizer.py
@@ -0,0 +1,81 @@
+import io
+import os
+import tempfile
+from typing import cast
+
+from datasets import Dataset, load_dataset
+from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
+from tokenizers import SentencePieceBPETokenizer  # type: ignore
+from transformers import LlamaTokenizerFast
+
+
+def hf_dataset_to_text(
+    dataset_name: str, split: str, column: str, text_file: io.TextIOBase
+):
+    dataset = cast(Dataset, load_dataset(dataset_name, split=split))
+    for text in dataset[column]:
+        text = text.strip()
+        text_file.write(text + "\n")
+
+
+def train_sentence_piece(
+    vocab_size: int,
+    sentence_iterator: io.TextIOBase,
+) -> SentencePieceProcessor:
+    """Trains a custom SentencePiece tokenizer."""
+    model = io.BytesIO()
+    SentencePieceTrainer.train(  # type: ignore
+        sentence_iterator=sentence_iterator,
+        model_writer=model,
+        model_type="bpe",
+        vocab_size=vocab_size,
+        self_test_sample_size=0,
+        character_coverage=1.0,
+        num_threads=os.cpu_count(),
+        split_digits=True,
+        allow_whitespace_only_pieces=True,
+        byte_fallback=True,
+        unk_surface=r" \342\201\207 ",
+        normalization_rule_name="identity",
+    )
+    return SentencePieceProcessor(model_proto=model.getvalue())  # type: ignore
+
+
+def sp_processor_to_hf_bpe_tokenizer(
+    sp_processor: SentencePieceProcessor,
+) -> SentencePieceBPETokenizer:
+    """Converts a SentencePieceProcessor to a SentencePieceBPETokenizer."""
+    vocab = {
+        sp_processor.id_to_piece(index): index  # type: ignore
+        for index in range(sp_processor.GetPieceSize())
+    }
+    merges = []
+    for piece_l in vocab.keys():
+        for piece_r in vocab.keys():
+            merge = f"{piece_l}{piece_r}"
+            piece_id = vocab.get(merge, None)
+            if piece_id:
+                merges += [(piece_l, piece_r, piece_id)]
+    merges = sorted(merges, key=lambda val: val[2])
+    merges = [(val[0], val[1]) for val in merges]
+
+    return SentencePieceBPETokenizer(vocab, merges)
+
+
+def hf_bpe_tokenizer_to_llama_tokenizer(
+    hf_bpe_tokenizer: SentencePieceBPETokenizer,
+) -> LlamaTokenizerFast:
+    with tempfile.NamedTemporaryFile(mode="w+", suffix=".json") as tmp_json_file:
+        hf_bpe_tokenizer.save(tmp_json_file.name)
+        return LlamaTokenizerFast(
+            tokenizer_file=tmp_json_file.name,
+            unk_token="<unk>",
+            unk_token_id=0,
+            bos_token="<s>",
+            bos_token_id=1,
+            eos_token="</s>",
+            eos_token_id=2,
+            pad_token="<pad>",
+            pad_token_id=3,
+            padding_side="right",
+        )
diff --git a/tests/scripts/functional_test_generate_logprobs.sh b/tests/scripts/functional_test_generate_logprobs.sh
deleted file mode 100644
index 95085645..00000000
--- a/tests/scripts/functional_test_generate_logprobs.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-#test to check if whether inference.py uploads log probabilities to Hugging Face.
-#similar to generate_logprobs.sh, much smaller.
-
-BATCH_SIZE=80
-DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"
-USERNAME="transcendingvictor" # Your Hugging Face username
-TOKEN="hf_aaaaaaaaaaaaaaaaaaaaaaaaa" # Your Hugging Face API token
-
-# List of models
-declare -a MODEL_NAMES=("delphi-suite/delphi-llama2-100k"
-                        "delphi-suite/delphi-llama2-200k"
-                        )
-
-# Loop through each model and generate log probabilities
-for MODEL_NAME in "${MODEL_NAMES[@]}"
-do
-  echo "Processing $MODEL_NAME"
-  python scripts/inference.py "$MODEL_NAME" --batch-size "$BATCH_SIZE" --dataset-name "$DATASET_NAME" --username "$USERNAME" --token "$TOKEN" --test-funct
-done
-
-echo "All models processed."
\ No newline at end of file
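
For reference, a sketch of how the new script could be invoked once this patch is applied. The flags match the argparse definitions in scripts/train_tokenizer.py above; the vocabulary size, dataset name, column, repository ID, and the HF_TOKEN environment variable are illustrative placeholders, not values taken from the patch:

    # hypothetical example run; replace every value with your own
    python scripts/train_tokenizer.py \
        --vocab-size 4096 \
        --dataset-name delphi-suite/example-stories \
        --split "train[:10%]" \
        --column text \
        --repo-id your-username/example-tokenizer \
        --hf-token "$HF_TOKEN"

Only --split carries a default ("train"); the parser defines no defaults for the other flags, so in practice they all need to be passed explicitly.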