Add Tiktoken support for TRTLLM #10306

Open · wants to merge 13 commits into base: main
3 changes: 3 additions & 0 deletions nemo/export/tensorrt_llm.py
@@ -505,10 +505,13 @@ def get_transformer_config(nemo_model_config):

         tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
         tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context")
+        vocab_path = os.path.join(nemo_export_dir, "vocab.json")
         if os.path.exists(tokenizer_path):
             shutil.copy(tokenizer_path, self.model_dir)
         elif os.path.exists(tokenizer_path_nemo2):
             shutil.copytree(tokenizer_path_nemo2, Path(self.model_dir) / "nemo_context")
+        elif os.path.exists(vocab_path):
+            shutil.copy(vocab_path, os.path.join(self.model_dir, "vocab.json"))
         else:
             self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer'))

123 changes: 123 additions & 0 deletions nemo/export/tiktoken_tokenizer.py
@@ -0,0 +1,123 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import json
from pathlib import Path
from typing import Dict, Optional

import numpy as np
import tiktoken
import torch

PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
SPECIAL_TOKENS = ["<unk>", "<s>", "</s>"]
SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"


def reload_mergeable_ranks(
    path: str,
    max_vocab: Optional[int] = None,
) -> Dict[bytes, int]:
    """
    Reload the tokenizer JSON file and convert it to Tiktoken format.
    """
    assert path.endswith(".json")

    # reload vocab
    with open(path, "r", encoding='utf-8') as f:
        vocab = json.load(f)
    assert isinstance(vocab, list)
    print(f"Vocab size: {len(vocab)}")
    if max_vocab is not None:
        vocab = vocab[:max_vocab]
        print(f"Cutting vocab to first {len(vocab)} tokens.")

    # build ranks: map each token's raw bytes to its BPE rank
    ranks: Dict[bytes, int] = {}
    for i, x in enumerate(vocab):
        assert x.keys() == {"rank", "token_bytes", "token_str"}
        assert x["rank"] == i
        merge = base64.b64decode(x["token_bytes"])
        # the first 256 ranks must be the raw single bytes 0..255
        assert i >= 256 or merge == bytes([i])
        ranks[merge] = x["rank"]

    # sanity check: ranks are unique and contiguous
    assert len(ranks) == len(vocab)
    assert set(ranks.values()) == set(range(len(ranks)))

    return ranks


class TiktokenTokenizer:
    def __init__(self, vocab_file: str):
        # the first 1000 IDs are reserved for special tokens; inner
        # (tiktoken) vocab IDs are shifted up by this amount
        self.num_special_tokens = 1000
        vocab_size = DEFAULT_TIKTOKEN_MAX_VOCAB
        pattern = PATTERN_TIKTOKEN
        special_tokens = SPECIAL_TOKENS.copy()
        inner_vocab_size = vocab_size - self.num_special_tokens

        token2id = reload_mergeable_ranks(vocab_file, max_vocab=inner_vocab_size)
        self.tokenizer = tiktoken.Encoding(
            name=Path(vocab_file).parent.name,
            pat_str=pattern,
            mergeable_ranks=token2id,
            special_tokens={},  # special tokens are handled manually
        )

        # BOS / EOS token IDs (pad falls back to EOS, see pad_id below)
        self._bos_id = special_tokens.index("<s>")
        self._eos_id = special_tokens.index("</s>")

    def encode(self, text):
        tokens = self.tokenizer.encode(text)
        # shift inner IDs past the reserved special-token range
        tokens = [t + self.num_special_tokens for t in tokens]
        return tokens

    def decode(self, tokens):
        # Filter out special tokens and adjust the remaining tokens
        adjusted_tokens = [
            t - self.num_special_tokens
            for t in tokens
            if t not in {self._bos_id, self._eos_id} and t >= self.num_special_tokens
        ]

        # Decode only if there are tokens left after filtering
        if adjusted_tokens:
            return self.tokenizer.decode(adjusted_tokens)
        else:
            return ""  # Return an empty string if all tokens were filtered out

    def batch_decode(self, ids):
        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
            ids = ids.tolist()

        if isinstance(ids[0], list):
            ids = ids[0]

        return self.decode(ids)

    @property
    def pad_id(self):
        return self._eos_id

    @property
    def bos_token_id(self):
        return self._bos_id

    @property
    def eos_token_id(self):
        return self._eos_id
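
To make the ID scheme concrete, here is a self-contained usage sketch. Assumptions, not from the PR: tiktoken is installed, and a synthetic byte-level-only vocab.json is enough to exercise the API; real checkpoints ship a full vocabulary in the same format.

```python
# Hypothetical round-trip demo for TiktokenTokenizer.
import base64
import json
import tempfile

from nemo.export.tiktoken_tokenizer import TiktokenTokenizer

# Synthesize a minimal vocab.json: the 256 single-byte tokens, in the
# {"rank", "token_bytes", "token_str"} format reload_mergeable_ranks() validates.
entries = [
    {"rank": i, "token_bytes": base64.b64encode(bytes([i])).decode(), "token_str": chr(i)}
    for i in range(256)
]
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(entries, f)
    vocab_path = f.name

tokenizer = TiktokenTokenizer(vocab_file=vocab_path)
ids = tokenizer.encode("Hello world")
assert min(ids) >= tokenizer.num_special_tokens  # inner IDs shifted past 0..999
assert tokenizer.decode(ids) == "Hello world"    # IDs below 1000 would be dropped
assert tokenizer.pad_id == tokenizer.eos_token_id
```

Note that decode() silently drops any ID below num_special_tokens, so round-trips are exact only for IDs produced by this same wrapper.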
17 changes: 13 additions & 4 deletions nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -36,6 +36,7 @@

 from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
 from nemo.export.tarutils import TarPath, ZarrPathStore
+from nemo.export.tiktoken_tokenizer import TiktokenTokenizer

 LOGGER = logging.getLogger("NeMo")

@@ -235,7 +236,7 @@ def load_sharded_metadata(checkpoint_dir: Union[Path, TarPath], torch_tensor=True):

 def update_tokenizer_paths(tokenizer_config: Dict, unpacked_checkpoints_dir):
     def _update_config_entry(key, file_pattern):
-        old_path = tokenizer_config[key]
+        old_path = tokenizer_config.get(key, None)
         if old_path is None:
             return
         old_path = Path(old_path)
@@ -262,7 +263,7 @@ def copy_tokenizer_files(config, out_dir):
     }

     for key in basenames.keys():
-        if config[key] is None:
+        if config.get(key, None) is None:
             continue

         path = config[key]
@@ -275,13 +276,16 @@ def copy_tokenizer_files(config, out_dir):
             continue

         dst_path = out_dir / f"{basenames[key]}{path.suffix}"
+        config[key] = str(dst_path)
         LOGGER.debug(f"Copy tokenizer {key}: {path}->{dst_path}")

         # Copy 'path' to 'dst_path' without shutil.copy(...) because 'path' may be a TarPath
         with path.open('rb') as infile:
             with open(dst_path, 'wb') as outfile:
                 outfile.write(infile.read())

+    return config


 def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:
     """Loads the tokenizer from the decoded NeMo weights dir."""
@@ -291,6 +295,10 @@ def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:

         tokenizer_spec = io.load_context((tokenizer_dir_or_path / "nemo_context"), subpath="model.tokenizer")
         return build_tokenizer(tokenizer_spec)
+    elif os.path.exists(os.path.join(tokenizer_dir_or_path, "vocab.json")):
+        vocab_path = tokenizer_dir_or_path / "vocab.json" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
+        tokenizer_config = {"library": "tiktoken", "vocab_file": str(vocab_path)}
+        return build_tokenizer(tokenizer_config)
     else:
         if (tokenizer_dir_or_path / "huggingface_tokenizer").is_dir():
             return AutoTokenizer.from_pretrained(tokenizer_dir_or_path / "huggingface_tokenizer")
@@ -307,6 +315,8 @@ def build_tokenizer(tokenizer):
     tokenizer_config = tokenizer
     if tokenizer_config["library"] == "sentencepiece":
         return SentencePieceTokenizer(model_path=tokenizer_config["model"])
+    elif tokenizer_config["library"] == "tiktoken":
+        return TiktokenTokenizer(vocab_file=tokenizer_config["vocab_file"])
     elif "GPT2" in tokenizer_config["type"]:
         tokenizer = GPT2Tokenizer(tokenizer_config["vocab_file"], tokenizer_config["merge_file"])
     else:
@@ -373,9 +383,8 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Path]):
             )
         else:
             tokenizer_config = update_tokenizer_paths(nemo_model_config["tokenizer"], unpacked_checkpoint_dir)
-            copy_tokenizer_files(tokenizer_config, nemo_export_dir)
+            tokenizer_config = copy_tokenizer_files(tokenizer_config, nemo_export_dir)

-            tokenizer_config["model"] = os.path.join(nemo_export_dir, "tokenizer.model")
             tokenizer = build_tokenizer(tokenizer_config)
     elif (nemo_dir / "weights").exists():
         dist_ckpt_folder = nemo_dir / "weights"
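
Putting the pieces together, the lookup on the import side now recognizes a bare vocab.json. A paraphrased sketch of get_tokenizer's branch order after this PR (not the actual implementation; the directory path is hypothetical and the fallback branch is simplified):

```python
import os
from pathlib import Path

def resolve_tokenizer_kind(tokenizer_dir: str) -> str:
    """Paraphrase of get_tokenizer()'s dispatch; returns which loader would fire."""
    p = Path(tokenizer_dir)
    if (p / "nemo_context").exists():
        return "nemo2 context"   # io.load_context(...) -> build_tokenizer(spec)
    elif os.path.exists(os.path.join(p, "vocab.json")):
        return "tiktoken"        # new: build_tokenizer({"library": "tiktoken", ...})
    elif (p / "huggingface_tokenizer").is_dir():
        return "huggingface"     # AutoTokenizer.from_pretrained(...)
    return "sentencepiece or other fallback"

print(resolve_tokenizer_kind("/tmp/trtllm_export"))  # hypothetical export dir
```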
9 changes: 9 additions & 0 deletions nemo/export/trt_llm/qnemo/tokenizer_utils.py
@@ -13,11 +13,14 @@
 # limitations under the License.

 import os
+import shutil
+import tempfile

Check notice (Code scanning / CodeQL): Unused import — Import of 'shutil' is not used.

 from omegaconf import OmegaConf
 from transformers import AutoTokenizer

 from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
+from nemo.export.tiktoken_tokenizer import TiktokenTokenizer

 # TODO: use get_nmt_tokenizer helper below to instantiate tokenizer once environment / dependencies get stable
 # from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
@@ -43,6 +46,12 @@ def get_nmt_tokenizer(nemo_checkpoint_path: str):
         tokenizer = SentencePieceTokenizer(
             model_path=os.path.join(nemo_checkpoint_path, tokenizer_cfg.model), legacy=legacy
         )
+    elif library == "tiktoken":
+        tmp_dir = tempfile.TemporaryDirectory()
+        tmp_path = os.path.join(tmp_dir.name, "vocab.json")
+        vocab_file = os.path.join(nemo_checkpoint_path, tokenizer_cfg.vocab_file)
+        shutil.copy(vocab_file, tmp_path)
+        tokenizer = TiktokenTokenizer(vocab_file=tmp_path)

Review comment — janekl (Collaborator):
    Could this be just tokenizer = TiktokenTokenizer(vocab_file=vocab_file), i.e. w/o creating tmp_dir?

Reply — meatybobby (Collaborator, Author), Nov 23, 2024:
    @janekl I've changed it

     else:
         raise NotImplementedError("Currently we only support 'huggingface' and 'sentencepiece' tokenizer libraries.")
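
Following the review exchange above, the temporary-directory copy can evidently be dropped in favor of constructing the tokenizer directly from the checkpoint path. A sketch of the suggested shape (the helper name is hypothetical; the PR author reports having made this change):

```python
import os

from nemo.export.tiktoken_tokenizer import TiktokenTokenizer

def load_tiktoken_tokenizer(nemo_checkpoint_path: str, vocab_file_name: str) -> TiktokenTokenizer:
    # Point directly at the vocab file inside the unpacked checkpoint,
    # instead of first copying it into a tempfile.TemporaryDirectory().
    vocab_file = os.path.join(nemo_checkpoint_path, vocab_file_name)
    return TiktokenTokenizer(vocab_file=vocab_file)
```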
