Bug fix in bpe tokenizer (pytorch#4149)
Summary:
- Record bos/eos in the binary format
- Update tests

Pull Request resolved: pytorch#4149

Reviewed By: larryliu0820

Differential Revision: D59349794

Pulled By: guangy10

fbshipit-source-id: ecdd5bd22dfcdc60429d179f07a61d46c832ef87
guangy10 authored and facebook-github-bot committed Jul 8, 2024
1 parent 5f2ab0e commit d948c1a
Showing 10 changed files with 46 additions and 38 deletions.
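For context on the fix: the exported tokenizer binary previously began with two 32-bit integers (vocab size, max token length), and this commit extends the header to four (vocab size, bos token id, eos token id, max token length) so the loader no longer hardcodes the llama2 bos/eos ids. Below is a minimal sketch of the two layouts; pack_header and unpack_header are hypothetical helper names for illustration only, not part of the ExecuTorch API.

import struct

# Old header: struct.pack("II", vocab_size, max_token_length) -> 8 bytes.
# New header: struct.pack("IIII", vocab_size, bos_id, eos_id,
# max_token_length) -> 16 bytes, read back on the C++ side by four
# fread() calls of sizeof(int32_t) in BPETokenizer::load().
# "I" is a native-byte-order unsigned 32-bit int, matching the exporter.

def pack_header(vocab_size, bos_id, eos_id, max_token_length):
    # Hypothetical helper mirroring the new exporter layout.
    return struct.pack("IIII", vocab_size, bos_id, eos_id, max_token_length)

def unpack_header(data):
    # Hypothetical helper mirroring what the C++ loader now reads.
    return struct.unpack("IIII", data[:16])

# Example: the llama2 defaults recorded by the new format
# (512 is an arbitrary illustrative max token length).
header = pack_header(32000, 1, 2, 512)
assert len(header) == 16
assert unpack_header(header) == (32000, 1, 2, 512)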
3 changes: 3 additions & 0 deletions .gitignore
@@ -11,7 +11,10 @@ __pycache__/
 
 # Any exported models and profiling outputs
 *.pte
+*.model
+!test_tiktoken_tokenizer.model
 *.bin
+!test_bpe_tokenizer.bin
 
 # Editor temporaries
 *.swa
21 changes: 11 additions & 10 deletions examples/models/llama2/tokenizer/bpe_tokenizer.cpp
@@ -24,12 +24,12 @@ static int compare_tokens(const void* a, const void* b) {
 }
 
 BPETokenizer::BPETokenizer() : Tokenizer() {
-  vocab_size_ = kVocabSize;
-  vocab_ = std::make_unique<char*[]>(kVocabSize);
-  vocab_scores_ = std::make_unique<float[]>(kVocabSize);
-  sorted_vocab_ = std::make_unique<TokenIndex[]>(kVocabSize);
-  bos_tok_ = 1;
-  eos_tok_ = 2;
+  vocab_size_ = kDefaultVocabSize;
+  vocab_ = std::make_unique<char*[]>(kDefaultVocabSize);
+  vocab_scores_ = std::make_unique<float[]>(kDefaultVocabSize);
+  sorted_vocab_ = std::make_unique<TokenIndex[]>(kDefaultVocabSize);
+  bos_tok_ = kDefaultBosTokenId;
+  eos_tok_ = kDefaultEosTokenId;
   for (int i = 0; i < 256; i++) {
     byte_pieces_[i * 2] = (unsigned char)i;
     byte_pieces_[i * 2 + 1] = '\0';
@@ -57,8 +57,8 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
     ET_LOG(Error, "couldn't load %s", tokenizer_path.c_str());
     return Error::InvalidArgument;
   }
-  int32_t metadata[2];
-  for (int i = 0; i < 2; i++) {
+  int32_t metadata[4];
+  for (int i = 0; i < 4; i++) {
     if (fread(metadata + i, sizeof(int32_t), 1, file) != 1) {
       ET_LOG(
           Error,
@@ -72,8 +72,9 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
   // tokenizer file.
   int32_t tokenizer_vocab_size = metadata[0];
   vocab_size_ = tokenizer_vocab_size;
-
-  max_token_length_ = metadata[1];
+  bos_tok_ = metadata[1];
+  eos_tok_ = metadata[2];
+  max_token_length_ = metadata[3];
 
   // allocate space for the vocabulary
   vocab_ = std::make_unique<char*[]>(vocab_size_);
5 changes: 4 additions & 1 deletion examples/models/llama2/tokenizer/bpe_tokenizer.h
@@ -14,7 +14,10 @@
 namespace torch {
 namespace executor {
 
-constexpr int32_t kVocabSize = 32000;
+// Default values for llama2
+constexpr int32_t kDefaultVocabSize = 32000;
+constexpr uint64_t kDefaultBosTokenId = 1;
+constexpr uint64_t kDefaultEosTokenId = 2;
 
 struct TokenIndex {
   const char* str;
Binary file not shown.
4 changes: 2 additions & 2 deletions examples/models/llama2/tokenizer/test/targets.bzl
@@ -44,9 +44,9 @@ def define_common_targets():
     )
 
     runtime.python_test(
-        name = "test_tokenizer_py",
+        name = "test_bpe_tokenizer_py",
         srcs = [
-            "test_tokenizer.py",
+            "test_bpe_tokenizer.py",
         ],
         visibility = [
             "//executorch/examples/...",
11 changes: 6 additions & 5 deletions examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
@@ -22,7 +22,8 @@ class TokenizerExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
    tokenizer_ = std::make_unique<BPETokenizer>();
-    modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
+    modelPath_ =
+        std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -47,13 +47,13 @@ TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
   EXPECT_EQ(result.error(), Error::NotSupported);
 }
 
-TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
+TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0.
+  // test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded.
   EXPECT_EQ(tokenizer_->vocab_size(), 0);
-  EXPECT_EQ(tokenizer_->bos_tok(), 1);
-  EXPECT_EQ(tokenizer_->eos_tok(), 2);
+  EXPECT_EQ(tokenizer_->bos_tok(), 0);
+  EXPECT_EQ(tokenizer_->eos_tok(), 0);
 }
 
 } // namespace executor
8 changes: 7 additions & 1 deletion examples/models/llama2/tokenizer/test/test_bpe_tokenizer.py
@@ -20,6 +20,8 @@ class TestTokenizer(unittest.TestCase):
     def test_export(self, mock_sp):
         # Set up the mock SentencePieceProcessor
         mock_sp.return_value.vocab_size.return_value = 0
+        mock_sp.return_value.bos_id.return_value = 0
+        mock_sp.return_value.eos_id.return_value = 0
         mock_sp.return_value.get_piece_size.return_value = 0
         # Create a temporary file
         with tempfile.NamedTemporaryFile(delete=True) as temp:
Expand All @@ -32,8 +34,12 @@ def test_export(self, mock_sp):
with open(output.name, "rb") as f:
data = f.read(16)
# Unpack the data as 4 integers
vocab_size, max_token_length = struct.unpack("II", data)
vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
"IIII", data
)
# Check that the integers match the properties of the tokenizer
self.assertEqual(vocab_size, 0)
self.assertEqual(bos_id, 0)
self.assertEqual(eos_id, 0)
# Check that the max token length is correct
self.assertEqual(max_token_length, 0)
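As a quick sanity check of the test's expectations: with the mocked SentencePieceProcessor returning 0 for vocab_size, bos_id, and eos_id, the exporter should emit exactly a 16-byte all-zero header and no token records. A standalone illustration (not part of the test file):

import struct

# vocab_size=0, bos_id=0, eos_id=0, max_token_length=0
expected_header = struct.pack("IIII", 0, 0, 0, 0)
assert expected_header == b"\x00" * 16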
20 changes: 4 additions & 16 deletions examples/models/llama2/tokenizer/test/test_tiktoken.cpp
@@ -22,8 +22,8 @@ class TiktokenExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>();
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -35,8 +35,8 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>(MULTIMODAL);
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -56,8 +56,6 @@ TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
 TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -66,8 +64,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -76,8 +72,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
   EXPECT_EQ(out.error(), Error::Ok);
   EXPECT_EQ(out.get().size(), 3);
@@ -89,8 +83,6 @@ TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode(
       "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>What do you think is going on in this snapshot?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAmidst a scenic garden backdrop, a man dressed in a suit with a distinct button on its lower portion stands prominently.<|eom_id|>",
       0,
@@ -112,8 +104,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
   std::vector<uint64_t> tokens = {128000, 15339, 1917};
   for (size_t i = 0; i < tokens.size(); i++) {
@@ -126,8 +116,6 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {
       "<|begin_of_text|>",
       "<|start_header_id|>",
12 changes: 9 additions & 3 deletions examples/models/llama2/tokenizer/tokenizer.py
@@ -58,8 +58,10 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
         The binary format is:
         1. vocab size: int32
-        2. max token length: int32
-        3. score: float32, len of bytes: int32, token bytes: [byte] for each token
+        2. bos token id: int32
+        3. eos token id: int32
+        4. max token length: int32
+        5. score: float32, len of bytes: int32, token bytes: [byte] for each token
 
         :param output_path: output path of the new binary.
         :param prepend_padding: a boolean to control if we want to prepend a padding token.
@@ -99,7 +101,11 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
         # write to a binary file
         with open(output_path, "wb") as f:
             # write the vocab size, bos/eos ids and max token length
-            f.write(struct.pack("II", self.n_words, max_token_length))
+            f.write(
+                struct.pack(
+                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
+                )
+            )
             for bytes, score in zip(tokens, scores):
                 f.write(struct.pack("fI", score, len(bytes)))
                 f.write(bytes)
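Putting the exporter's documented format together, here is a hedged reader-side sketch of a file produced by the updated export() above; read_tokenizer_bin is a hypothetical helper for illustration and does not exist in the repository.

import struct

def read_tokenizer_bin(path):
    # Header: four native-order uint32 fields, per the docstring above.
    with open(path, "rb") as f:
        vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
            "IIII", f.read(16)
        )
        # Then one record per token: float32 score, uint32 byte length,
        # followed by that many raw token bytes.
        tokens = []
        for _ in range(vocab_size):
            score, length = struct.unpack("fI", f.read(8))
            tokens.append((score, f.read(length)))
    return vocab_size, bos_id, eos_id, max_token_length, tokens

Like the exporter's struct.pack calls, this sketch uses struct's native byte order, so it assumes producer and consumer run on the same architecture, which is also what the fread()-based C++ loader assumes.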
