integrate some of the suggested changes (WIP)
jannik-brinkmann committed Apr 5, 2024
1 parent 1b799fc commit 0fccd5a
Showing 2 changed files with 52 additions and 39 deletions.
76 changes: 50 additions & 26 deletions scripts/train_tokenizer.py
@@ -1,5 +1,7 @@
import argparse
import os
import sentencepiece as spm
import tempfile

from datasets import load_dataset
from tqdm.auto import tqdm, trange
@@ -12,9 +14,12 @@
def main(
vocab_size: int,
dataset_name: str,
column: str,
train_size: float,
username: str,
repo_id: str,
token: str,
seed: int,
funct_test: bool = False,
):
"""
@@ -25,23 +30,25 @@ def main(
- dataset_name: The name of the dataset from which the training data will be loaded
- train_size: The fraction of the dataset that should be used for training
- username: Hugging Face API username
- repo_id: Hugging Face repository ID
- token: Hugging Face API token
"""
train_ds = load_dataset(dataset_name)["train"]
if train_size < 1.0:
train_ds = train_ds.train_test_split(train_size=train_size)["train"]

tokenizer_model_path = get_tokenizer_model_path(
vocab_size=vocab_size,
)
train_ds = train_ds.train_test_split(
train_size=train_size,
seed=seed
)["train"]

assert vocab_size > 0, "vocab_size should be greater than 0"
tokenizer_model_path = f"tok{vocab_size}.model"
if not os.path.isfile(tokenizer_model_path):
train_vocab(
vocab_size=vocab_size,
dataset=train_ds,
column=column
)

import sentencepiece as spm
tokenizer_model_path = get_tokenizer_model_path(vocab_size=vocab_size)
sp_model = spm.SentencePieceProcessor(model_file=tokenizer_model_path)

# export 'vocab' and 'merges'
@@ -57,29 +64,29 @@ def main(
merges = [(val[0], val[1]) for val in merges]

# convert to BPE tokenizer
bpe_tokenizer_file_path = f"tok{vocab_size}-sentencepiece-tokenizer.json"
bpe_tokenizer = SentencePieceBPETokenizer(vocab, merges)
bpe_tokenizer.save(bpe_tokenizer_file_path, pretty=True)

# convert to LLaMA Tokenizer
tokenizer = LlamaTokenizerFast(
tokenizer_file=bpe_tokenizer_file_path,
unk_token="<unk>",
unk_token_id=0,
bos_token="<s>",
bos_token_id=1,
eos_token="</s>",
eos_token_id=2,
pad_token="<pad>",
pad_token_id=3,
padding_side="right",
)
os.remove(bpe_tokenizer_file_path)

# Convert to LLaMA Tokenizer
with tempfile.NamedTemporaryFile(mode='w+', suffix='.json') as tmpfile:
bpe_tokenizer.save(tmpfile.name, pretty=True)
tmpfile.seek(0)
tokenizer = LlamaTokenizerFast(
tokenizer_file=tmpfile.name,
unk_token="<unk>",
unk_token_id=0,
bos_token="<s>",
bos_token_id=1,
eos_token="</s>",
eos_token_id=2,
pad_token="<pad>",
pad_token_id=3,
padding_side="right",
)
print("Converted tokenizer to huggingface tokenizer.")

# push tokenizer to the hub
tokenizer.push_to_hub(
repo_id="jbrinkma/tokenizer_test",
repo_id=repo_id,
)
print("Pushed tokenizer to huggingface hub.")

@@ -98,9 +105,13 @@ def main(
type=str,
help="Dataset name with or without delphi-suite/ prefix",
)
parser.add_argument(
"--column",
type=str,
help="Column of the dataset to be used for training",
)
parser.add_argument(
"--train-size",
type=float,
help="Subset of the dataset to be used for training",
default=1.0,
)
@@ -109,11 +120,21 @@ def main(
type=str,
help="Hugging Face API username",
)
parser.add_argument(
"--repo-id",
type=str,
help="Hugging Face repository ID",
)
parser.add_argument(
"--token",
type=str,
help="Hugging Face API token",
)
parser.add_argument(
"--seed",
type=int,
help="Random seed for the train/test split",
)
parser.add_argument(
"--test-funct", action="store_true", help="Enable test function mode"
)
@@ -124,7 +145,10 @@ def main(
args.vocab_size,
args.dataset_name,
args.column,
args.train_size,
args.username,
args.repo_id,
args.token,
args.seed,
args.test_funct,
)
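
For context, a quick way to sanity-check the result of this script is to load the pushed tokenizer back from the Hub and round-trip a short sample. This is a minimal sketch and not part of the commit; the repository id and sample text below are placeholders.

from transformers import LlamaTokenizerFast

# Placeholder repo id: substitute the repository the script pushed to.
tok = LlamaTokenizerFast.from_pretrained("your-username/your-tokenizer")
ids = tok("Once upon a time")["input_ids"]
print(ids)  # encoded token ids
print(tok.decode(ids, skip_special_tokens=True))  # should closely recover the sample text
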
15 changes: 2 additions & 13 deletions src/delphi/train/tokenizer.py
@@ -4,21 +4,10 @@
from datasets import Dataset


def get_tokenizer_model_path(
vocab_size: int,
cache_dir: str = "cache"
) -> str:
"""
Returns path to the SentencePiece tokenizer model for a given vocab size.
"""
if vocab_size == 0:
return ""
else:
return os.path.join(cache_dir, f"tok{vocab_size}.model")

def train_vocab(
vocab_size: int,
dataset: Dataset,
column: str,
cache_dir: str = "cache"
) -> None:
"""
@@ -33,7 +22,7 @@ def train_vocab(
text_file = os.path.join(cache_dir, "text.txt")
with open(text_file, 'w', encoding='utf-8') as file:
for item in dataset:
text = item['story']
text = item[column]
text = text.strip()
file.write(text + '\n')
print(f"Size is: {os.path.getsize(text_file) / 1024 / 1024:.2f} MB")
