diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index a1e6cb0e03c4..d5a8d4126aef 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -505,10 +505,13 @@ def get_transformer_config(nemo_model_config):
 
         tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
         tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context")
+        vocab_path = os.path.join(nemo_export_dir, "vocab.json")
         if os.path.exists(tokenizer_path):
             shutil.copy(tokenizer_path, self.model_dir)
         elif os.path.exists(tokenizer_path_nemo2):
             shutil.copytree(tokenizer_path_nemo2, Path(self.model_dir) / "nemo_context")
+        elif os.path.exists(vocab_path):
+            shutil.copy(vocab_path, os.path.join(self.model_dir, "vocab.json"))
         else:
             self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer'))
 
diff --git a/nemo/export/tiktoken_tokenizer.py b/nemo/export/tiktoken_tokenizer.py
new file mode 100644
index 000000000000..d599620256fa
--- /dev/null
+++ b/nemo/export/tiktoken_tokenizer.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import json
+from pathlib import Path
+from typing import Dict, Optional
+
+import numpy as np
+import tiktoken
+import torch
+
+PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
+SPECIAL_TOKENS = ["<unk>", "<s>", "</s>"]
+SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"
+
+
+def reload_mergeable_ranks(
+    path: str,
+    max_vocab: Optional[int] = None,
+) -> Dict[bytes, int]:
+    """
+    Reload the tokenizer JSON file and convert it to Tiktoken format.
+ """ + assert path.endswith(".json") + + # reload vocab + with open(path, "r", encoding='utf-8') as f: + vocab = json.load(f) + assert isinstance(vocab, list) + print(f"Vocab size: {len(vocab)}") + if max_vocab is not None: + vocab = vocab[:max_vocab] + print(f"Cutting vocab to first {len(vocab)} tokens.") + + # build ranks + ranks: Dict[bytes, int] = {} + for i, x in enumerate(vocab): + assert x.keys() == {"rank", "token_bytes", "token_str"} + assert x["rank"] == i + merge = base64.b64decode(x["token_bytes"]) + assert i >= 256 or merge == bytes([i]) + ranks[merge] = x["rank"] + + # sanity check + assert len(ranks) == len(vocab) + assert set(ranks.values()) == set(range(len(ranks))) + + return ranks + + +class TiktokenTokenizer: + def __init__(self, vocab_file: str): + + self.num_special_tokens = 1000 + vocab_size = DEFAULT_TIKTOKEN_MAX_VOCAB + pattern = PATTERN_TIKTOKEN + special_tokens = SPECIAL_TOKENS.copy() + inner_vocab_size = vocab_size - self.num_special_tokens + + token2id = reload_mergeable_ranks(vocab_file, max_vocab=inner_vocab_size) + self.tokenizer = tiktoken.Encoding( + name=Path(vocab_file).parent.name, + pat_str=pattern, + mergeable_ranks=token2id, + special_tokens={}, # special tokens are handled manually + ) + + # BOS / EOS / Pad token IDs + self._bos_id = special_tokens.index("") + self._eos_id = special_tokens.index("") + + def encode(self, text): + tokens = self.tokenizer.encode(text) + tokens = [t + self.num_special_tokens for t in tokens] + return tokens + + def decode(self, tokens): + # Filter out special tokens and adjust the remaining tokens + adjusted_tokens = [ + t - self.num_special_tokens + for t in tokens + if t not in {self._bos_id, self._eos_id} and t >= self.num_special_tokens + ] + + # Decode only if there are tokens left after filtering + if adjusted_tokens: + return self.tokenizer.decode(adjusted_tokens) + else: + return "" # Return an empty string if all tokens were filtered out + + def batch_decode(self, ids): + if isinstance(ids, np.ndarray) or torch.is_tensor(ids): + ids = ids.tolist() + + if isinstance(ids[0], list): + ids = ids[0] + + return self.decode(ids) + + @property + def pad_id(self): + return self._eos_id + + @property + def bos_token_id(self): + return self._bos_id + + @property + def eos_token_id(self): + return self._eos_id diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 9ace6425f533..0c634439268d 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -36,6 +36,7 @@ from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.export.tarutils import TarPath, ZarrPathStore +from nemo.export.tiktoken_tokenizer import TiktokenTokenizer LOGGER = logging.getLogger("NeMo") @@ -235,7 +236,7 @@ def load_sharded_metadata(checkpoint_dir: Union[Path, TarPath], torch_tensor=Tru def update_tokenizer_paths(tokenizer_config: Dict, unpacked_checkpoints_dir): def _update_config_entry(key, file_pattern): - old_path = tokenizer_config[key] + old_path = tokenizer_config.get(key, None) if old_path is None: return old_path = Path(old_path) @@ -262,7 +263,7 @@ def copy_tokenizer_files(config, out_dir): } for key in basenames.keys(): - if config[key] is None: + if config.get(key, None) is None: continue path = config[key] @@ -275,6 +276,7 @@ def copy_tokenizer_files(config, out_dir): continue dst_path = out_dir / f"{basenames[key]}{path.suffix}" + config[key] = str(dst_path) LOGGER.debug(f"Copy 
 
         # Copy 'path' to 'dst_path' without shutil.copy(...) because 'path' may be a TarPath
@@ -282,6 +284,8 @@ def copy_tokenizer_files(config, out_dir):
         with open(dst_path, 'wb') as outfile:
             outfile.write(infile.read())
 
+    return config
+
 
 def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:
     """Loads the tokenizer from the decoded NeMo weights dir."""
@@ -291,6 +295,10 @@ def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenize
 
         tokenizer_spec = io.load_context((tokenizer_dir_or_path / "nemo_context"), subpath="model.tokenizer")
         return build_tokenizer(tokenizer_spec)
+    elif os.path.exists(os.path.join(tokenizer_dir_or_path, "vocab.json")):
+        vocab_path = tokenizer_dir_or_path / "vocab.json" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
+        tokenizer_config = {"library": "tiktoken", "vocab_file": str(vocab_path)}
+        return build_tokenizer(tokenizer_config)
     else:
         if (tokenizer_dir_or_path / "huggingface_tokenizer").is_dir():
             return AutoTokenizer.from_pretrained(tokenizer_dir_or_path / "huggingface_tokenizer")
@@ -307,6 +315,8 @@ def build_tokenizer(tokenizer):
         tokenizer_config = tokenizer
         if tokenizer_config["library"] == "sentencepiece":
             return SentencePieceTokenizer(model_path=tokenizer_config["model"])
+        elif tokenizer_config["library"] == "tiktoken":
+            return TiktokenTokenizer(vocab_file=tokenizer_config["vocab_file"])
         elif "GPT2" in tokenizer_config["type"]:
             tokenizer = GPT2Tokenizer(tokenizer_config["vocab_file"], tokenizer_config["merge_file"])
         else:
@@ -373,9 +383,8 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat
             )
         else:
             tokenizer_config = update_tokenizer_paths(nemo_model_config["tokenizer"], unpacked_checkpoint_dir)
-            copy_tokenizer_files(tokenizer_config, nemo_export_dir)
+            tokenizer_config = copy_tokenizer_files(tokenizer_config, nemo_export_dir)
 
-            tokenizer_config["model"] = os.path.join(nemo_export_dir, "tokenizer.model")
             tokenizer = build_tokenizer(tokenizer_config)
     elif (nemo_dir / "weights").exists():
         dist_ckpt_folder = nemo_dir / "weights"
diff --git a/nemo/export/trt_llm/qnemo/tokenizer_utils.py b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
index beca40bcd3d7..d97664f3dd5a 100644
--- a/nemo/export/trt_llm/qnemo/tokenizer_utils.py
+++ b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
@@ -13,11 +13,14 @@
 # limitations under the License.
 
 import os
+import shutil
+import tempfile
 
 from omegaconf import OmegaConf
 from transformers import AutoTokenizer
 
 from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
+from nemo.export.tiktoken_tokenizer import TiktokenTokenizer
 
 # TODO: use get_nmt_tokenizer helper below to instantiate tokenizer once environment / dependencies get stable
 # from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
@@ -43,6 +46,9 @@ def get_nmt_tokenizer(nemo_checkpoint_path: str):
         tokenizer = SentencePieceTokenizer(
             model_path=os.path.join(nemo_checkpoint_path, tokenizer_cfg.model), legacy=legacy
         )
+    elif library == "tiktoken":
+        print(f"Getting TiktokenTokenizer with file: {tokenizer_cfg.vocab_file}")
+        tokenizer = TiktokenTokenizer(vocab_file=os.path.join(nemo_checkpoint_path, tokenizer_cfg.vocab_file))
     else:
         raise NotImplementedError("Currently we only support 'huggingface' and 'sentencepiece' tokenizer libraries.")
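
For reviewers, a minimal sketch of the round trip this diff enables. It is illustrative only, not part of the change: the export directory path and sample string are hypothetical, and the directory is assumed to already contain a Nemotron-style vocab.json (a JSON list of {"rank", "token_bytes", "token_str"} entries, as validated by reload_mergeable_ranks above).

```python
from pathlib import Path

from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import get_tokenizer

# get_tokenizer() now spots a bare vocab.json in the export dir and builds a
# TiktokenTokenizer instead of falling through to the HuggingFace branch.
tokenizer = get_tokenizer(Path("/tmp/nemo_export"))  # hypothetical export dir

ids = tokenizer.encode("Hello, world!")
# encode() shifts every inner tiktoken rank up by the 1000 reserved
# special-token slots, so real tokens can never collide with BOS/EOS ids.
assert all(t >= tokenizer.num_special_tokens for t in ids)

# decode() drops BOS/EOS and any id below the special-token range, then
# undoes the shift, so plain text round-trips exactly.
assert tokenizer.decode(ids) == "Hello, world!"

# There is no dedicated pad token; pad_id falls back to EOS.
assert tokenizer.pad_id == tokenizer.eos_token_id
```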