From d948c1acd7d8a2b7e41bcdc28155f2ceca2af0e1 Mon Sep 17 00:00:00 2001
From: Guang Yang
Date: Mon, 8 Jul 2024 15:45:22 -0700
Subject: [PATCH] Bug fix in bpe tokenizer (#4149)

Summary:
- Record bos/eos in the binary format
- Updated tests

Pull Request resolved: https://github.com/pytorch/executorch/pull/4149

Reviewed By: larryliu0820

Differential Revision: D59349794

Pulled By: guangy10

fbshipit-source-id: ecdd5bd22dfcdc60429d179f07a61d46c832ef87
---
 .gitignore                                    |  3 +++
 .../models/llama2/tokenizer/bpe_tokenizer.cpp | 21 +++++++++---------
 .../models/llama2/tokenizer/bpe_tokenizer.h   |  5 ++++-
 .../{test.bin => test_bpe_tokenizer.bin}      | Bin 8 -> 16 bytes
 ...er.model => test_tiktoken_tokenizer.model} |  0
 .../models/llama2/tokenizer/test/targets.bzl  |  4 ++--
 .../tokenizer/test/test_bpe_tokenizer.cpp     | 11 ++++-----
 ...est_tokenizer.py => test_bpe_tokenizer.py} |  8 ++++++-
 .../llama2/tokenizer/test/test_tiktoken.cpp   | 20 ++++-------------
 examples/models/llama2/tokenizer/tokenizer.py | 12 +++++++---
 10 files changed, 46 insertions(+), 38 deletions(-)
 rename examples/models/llama2/tokenizer/test/resources/{test.bin => test_bpe_tokenizer.bin} (50%)
 rename examples/models/llama2/tokenizer/test/resources/{tokenizer.model => test_tiktoken_tokenizer.model} (100%)
 rename examples/models/llama2/tokenizer/test/{test_tokenizer.py => test_bpe_tokenizer.py} (82%)

diff --git a/.gitignore b/.gitignore
index 26a46f23f6..d766479b11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,10 @@ __pycache__/
 
 # Any exported models and profiling outputs
 *.pte
+*.model
+!test_tiktoken_tokenizer.model
 *.bin
+!test_bpe_tokenizer.bin
 
 # Editor temporaries
 *.swa
diff --git a/examples/models/llama2/tokenizer/bpe_tokenizer.cpp b/examples/models/llama2/tokenizer/bpe_tokenizer.cpp
index 077a195515..1ac2262a65 100644
--- a/examples/models/llama2/tokenizer/bpe_tokenizer.cpp
+++ b/examples/models/llama2/tokenizer/bpe_tokenizer.cpp
@@ -24,12 +24,12 @@ static int compare_tokens(const void* a, const void* b) {
 }
 
 BPETokenizer::BPETokenizer() : Tokenizer() {
-  vocab_size_ = kVocabSize;
-  vocab_ = std::make_unique<char*[]>(kVocabSize);
-  vocab_scores_ = std::make_unique<float[]>(kVocabSize);
-  sorted_vocab_ = std::make_unique<TokenIndex[]>(kVocabSize);
-  bos_tok_ = 1;
-  eos_tok_ = 2;
+  vocab_size_ = kDefaultVocabSize;
+  vocab_ = std::make_unique<char*[]>(kDefaultVocabSize);
+  vocab_scores_ = std::make_unique<float[]>(kDefaultVocabSize);
+  sorted_vocab_ = std::make_unique<TokenIndex[]>(kDefaultVocabSize);
+  bos_tok_ = kDefaultBosTokenId;
+  eos_tok_ = kDefaultEosTokenId;
   for (int i = 0; i < 256; i++) {
     byte_pieces_[i * 2] = (unsigned char)i;
     byte_pieces_[i * 2 + 1] = '\0';
@@ -57,8 +57,8 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
     ET_LOG(Error, "couldn't load %s", tokenizer_path.c_str());
     return Error::InvalidArgument;
   }
-  int32_t metadata[2];
-  for (int i = 0; i < 2; i++) {
+  int32_t metadata[4];
+  for (int i = 0; i < 4; i++) {
     if (fread(metadata + i, sizeof(int32_t), 1, file) != 1) {
       ET_LOG(
           Error,
@@ -72,8 +72,9 @@
   // tokenizer file.
   int32_t tokenizer_vocab_size = metadata[0];
   vocab_size_ = tokenizer_vocab_size;
-
-  max_token_length_ = metadata[1];
+  bos_tok_ = metadata[1];
+  eos_tok_ = metadata[2];
+  max_token_length_ = metadata[3];
 
   // allocate space for the vocabulary
   vocab_ = std::make_unique<char*[]>(vocab_size_);
diff --git a/examples/models/llama2/tokenizer/bpe_tokenizer.h b/examples/models/llama2/tokenizer/bpe_tokenizer.h
index 8cbed98b7c..3615ce6990 100644
--- a/examples/models/llama2/tokenizer/bpe_tokenizer.h
+++ b/examples/models/llama2/tokenizer/bpe_tokenizer.h
@@ -14,7 +14,10 @@
 namespace torch {
 namespace executor {
 
-constexpr int32_t kVocabSize = 32000;
+// Default values for llama2
+constexpr int32_t kDefaultVocabSize = 32000;
+constexpr uint64_t kDefaultBosTokenId = 1;
+constexpr uint64_t kDefaultEosTokenId = 2;
 
 struct TokenIndex {
   const char* str;
diff --git a/examples/models/llama2/tokenizer/test/resources/test.bin b/examples/models/llama2/tokenizer/test/resources/test_bpe_tokenizer.bin
similarity index 50%
rename from examples/models/llama2/tokenizer/test/resources/test.bin
rename to examples/models/llama2/tokenizer/test/resources/test_bpe_tokenizer.bin
index 1b1cb4d44c57c2d7a5122870fa6ac3e62ff7e94e..01d633b27e8ea9b17084fc911d0c8cc43a4170a9 100644
GIT binary patch
literal 16
KcmZQzKm`B*5C8!H

literal 8
KcmZQzfB*mh2mk>9

diff --git a/examples/models/llama2/tokenizer/test/resources/tokenizer.model b/examples/models/llama2/tokenizer/test/resources/test_tiktoken_tokenizer.model
similarity index 100%
rename from examples/models/llama2/tokenizer/test/resources/tokenizer.model
rename to examples/models/llama2/tokenizer/test/resources/test_tiktoken_tokenizer.model
diff --git a/examples/models/llama2/tokenizer/test/targets.bzl b/examples/models/llama2/tokenizer/test/targets.bzl
index 1709078ab8..532767cb3f 100644
--- a/examples/models/llama2/tokenizer/test/targets.bzl
+++ b/examples/models/llama2/tokenizer/test/targets.bzl
@@ -44,9 +44,9 @@ def define_common_targets():
     )
 
     runtime.python_test(
-        name = "test_tokenizer_py",
+        name = "test_bpe_tokenizer_py",
         srcs = [
-            "test_tokenizer.py",
+            "test_bpe_tokenizer.py",
         ],
         visibility = [
             "//executorch/examples/...",
diff --git a/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
index 206657ebe1..dd7da30e60 100644
--- a/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
+++ b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
@@ -22,7 +22,8 @@ class TokenizerExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<BPETokenizer>();
-    modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
+    modelPath_ =
+        std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -47,13 +48,13 @@ TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
   EXPECT_EQ(result.error(), Error::NotSupported);
 }
 
-TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
+TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0.
+  // test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded.
   EXPECT_EQ(tokenizer_->vocab_size(), 0);
-  EXPECT_EQ(tokenizer_->bos_tok(), 1);
-  EXPECT_EQ(tokenizer_->eos_tok(), 2);
+  EXPECT_EQ(tokenizer_->bos_tok(), 0);
+  EXPECT_EQ(tokenizer_->eos_tok(), 0);
 }
 
 } // namespace executor
diff --git a/examples/models/llama2/tokenizer/test/test_tokenizer.py b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.py
similarity index 82%
rename from examples/models/llama2/tokenizer/test/test_tokenizer.py
rename to examples/models/llama2/tokenizer/test/test_bpe_tokenizer.py
index 9c195a8403..0461323598 100644
--- a/examples/models/llama2/tokenizer/test/test_tokenizer.py
+++ b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.py
@@ -20,6 +20,8 @@ class TestTokenizer(unittest.TestCase):
     def test_export(self, mock_sp):
         # Set up the mock SentencePieceProcessor
         mock_sp.return_value.vocab_size.return_value = 0
+        mock_sp.return_value.bos_id.return_value = 0
+        mock_sp.return_value.eos_id.return_value = 0
         mock_sp.return_value.get_piece_size.return_value = 0
         # Create a temporary file
         with tempfile.NamedTemporaryFile(delete=True) as temp:
@@ -32,8 +34,12 @@ def test_export(self, mock_sp):
         with open(output.name, "rb") as f:
             data = f.read(16)
             # Unpack the data as 4 integers
-            vocab_size, max_token_length = struct.unpack("II", data)
+            vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
+                "IIII", data
+            )
             # Check that the integers match the properties of the tokenizer
             self.assertEqual(vocab_size, 0)
+            self.assertEqual(bos_id, 0)
+            self.assertEqual(eos_id, 0)
             # Check that the max token length is correct
             self.assertEqual(max_token_length, 0)
diff --git a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp
index 8ee768f4bf..dcc1c356b3 100644
--- a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp
+++ b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp
@@ -22,8 +22,8 @@ class TiktokenExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>();
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -35,8 +35,8 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>(MULTIMODAL);
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -56,8 +56,6 @@ TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
 TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -66,8 +64,6 @@ TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -76,8 +72,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
   EXPECT_EQ(out.error(), Error::Ok);
   EXPECT_EQ(out.get().size(), 3);
@@ -89,8 +83,6 @@ TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode(
       "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>What do you think is going on in this snapshot?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAmidst a scenic garden backdrop, a man dressed in a suit with a distinct button on its lower portion stands prominently.<|eom_id|>",
       0,
@@ -112,8 +104,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
   std::vector<uint64_t> tokens = {128000, 15339, 1917};
   for (size_t i = 0; i < tokens.size(); i++) {
@@ -126,8 +116,6 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {
       "<|begin_of_text|>",
       "<|start_header_id|>",
diff --git a/examples/models/llama2/tokenizer/tokenizer.py b/examples/models/llama2/tokenizer/tokenizer.py
index a63aa2d1b3..ecd0231fb6 100644
--- a/examples/models/llama2/tokenizer/tokenizer.py
+++ b/examples/models/llama2/tokenizer/tokenizer.py
@@ -58,8 +58,10 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
 
         The binary format is:
         1. vocab size: int32
-        2. max token length: int32
-        3. score: float32, len of bytes: int32, token bytes: [byte] for each token
+        2. bos token id: int32
+        3. eos token id: int32
+        4. max token length: int32
+        5. score: float32, len of bytes: int32, token bytes: [byte] for each token
 
         :param output_path: output path of the new binary.
         :param prepend_padding: a boolean to control if we want to prepend a padding token.
@@ -99,7 +101,11 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
         # write to a binary file
         with open(output_path, "wb") as f:
             # write the vocab size, bos/eos ids and max token length
-            f.write(struct.pack("II", self.n_words, max_token_length))
+            f.write(
+                struct.pack(
+                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
+                )
+            )
             for bytes, score in zip(tokens, scores):
                 f.write(struct.pack("fI", score, len(bytes)))
                 f.write(bytes)
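
--
Reviewer note (not part of the patch): after this change, an exported tokenizer
binary starts with four native-endian uint32 header fields -- vocab_size,
bos_id, eos_id, max_token_length -- followed by one record of (score: float32,
byte length: int32, raw token bytes) per vocab entry, mirroring
struct.pack("IIII", ...) and struct.pack("fI", ...) in tokenizer.py. A minimal
standalone sketch for sanity-checking an exported .bin against what the C++
BPETokenizer::load() will read is below; the helper names read_header and
read_tokens and the example path "tokenizer.bin" are hypothetical, not from
this PR.

import struct

def read_header(path):
    # Read the four uint32 header fields written by Tokenizer.export().
    # "IIII" mirrors the pack format used in export(); 16 bytes total.
    with open(path, "rb") as f:
        vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
            "IIII", f.read(16)
        )
    return vocab_size, bos_id, eos_id, max_token_length

def read_tokens(path):
    # Read the per-token records that follow the header: for each token,
    # a float32 score, an int32 byte count, then that many raw bytes.
    with open(path, "rb") as f:
        vocab_size = struct.unpack("IIII", f.read(16))[0]
        tokens = []
        for _ in range(vocab_size):
            score, length = struct.unpack("fI", f.read(8))
            tokens.append((f.read(length), score))
        return tokens

if __name__ == "__main__":
    # "tokenizer.bin" is a placeholder path for a locally exported artifact.
    vocab_size, bos_id, eos_id, max_token_length = read_header("tokenizer.bin")
    print(vocab_size, bos_id, eos_id, max_token_length)

Parsing the header this way reproduces what test_bpe_tokenizer.py asserts after
the fix (vocab_size, bos_id, eos_id, max_token_length unpacked from the first
16 bytes), so it can catch a stale two-field binary produced by the old
struct.pack("II", ...) layout.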