Skip to content

Commit

Permalink
Move tokenizer into extension/llm/tokenizer (pytorch#4278)
Browse files Browse the repository at this point in the history
Summary:
This PR moves the tiktoken and bpe tokenizers into `extension/llm/tokenizer` such that they can be reused by other models.

Note: Currently the tiktoken tokenizer has two sets of unit tests based on llama2's tokenizer:
- default
- multimodal

This PR only moves the default unit test into extension and keeps the multimodal's unit tests inside llama2/tokenizer.

Pull Request resolved: pytorch#4278

Test Plan:
- test/run_oss_cpp_tests.sh examples/models/llama2/tokenizer/test
- test/run_oss_cpp_tests.sh extension/llm/tokenizer/test

Reviewed By: larryliu0820

Differential Revision: D59822702

Pulled By: helunwencser

fbshipit-source-id: 5d51ba3e44c9b2d9dc77b9f4349b58947ed68502
  • Loading branch information
helunwencser authored and facebook-github-bot committed Jul 17, 2024
1 parent 740a0a5 commit 0cde6b8
Show file tree
Hide file tree
Showing 31 changed files with 128,324 additions and 180 deletions.
12 changes: 6 additions & 6 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@
[submodule "examples/third-party/LLaVA"]
path = examples/third-party/LLaVA
url = https://github.com/haotian-liu/LLaVA.git
[submodule "examples/models/llama2/third-party/re2"]
path = examples/models/llama2/third-party/re2
url = https://github.com/google/re2.git
[submodule "examples/models/llama2/third-party/abseil-cpp"]
path = examples/models/llama2/third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
[submodule "third-party/ios-cmake"]
path = third-party/ios-cmake
url = https://github.com/leetal/ios-cmake
[submodule "examples/models/phi-3-mini/third-party/sentencepiece"]
path = examples/models/phi-3-mini/third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
[submodule "extension/llm/third-party/re2"]
path = extension/llm/third-party/re2
url = https://github.com/google/re2.git
[submodule "extension/llm/third-party/abseil-cpp"]
path = extension/llm/third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
10 changes: 8 additions & 2 deletions examples/models/llama2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,14 @@ if(EXECUTORCH_USE_TIKTOKEN)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
target_link_libraries(llama_runner PUBLIC re2::re2)
endif()
Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ target_include_directories(

if(EXECUTORCH_USE_TIKTOKEN)
list(APPEND _llama_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
)
list(APPEND _llama_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp
Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#if ET_USE_TIKTOKEN
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#else /* BPE */
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#endif /* ET_USE_TIKTOKEN*/
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/runner_util/managed_tensor.h>
Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <unordered_map>

#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def define_common_targets():
] + ([
"//executorch/examples/models/llama2/tokenizer:tiktoken",
] if use_tiktoken() else [
"//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
]) + (_get_operator_lib(aten)) + ([
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
# Therefore enable it explicitly for now to avoid failing tests
Expand Down
1 change: 0 additions & 1 deletion examples/models/llama2/third-party/abseil-cpp
Submodule abseil-cpp deleted from 854193
1 change: 0 additions & 1 deletion examples/models/llama2/third-party/re2
Submodule re2 deleted from ac82d4
2 changes: 1 addition & 1 deletion examples/models/llama2/tokenizer/llama_tiktoken.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#pragma once

#include <executorch/examples/models/llama2/tokenizer/tiktoken.h>
#include <executorch/extension/llm/tokenizer/tiktoken.h>

namespace torch {
namespace executor {
Expand Down
31 changes: 5 additions & 26 deletions examples/models/llama2/tokenizer/targets.bzl
Original file line number Diff line number Diff line change
@@ -1,44 +1,23 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
runtime.cxx_library(
name = "bpe_tokenizer",
srcs = [
"bpe_tokenizer.cpp",
],
exported_headers = [
"tokenizer.h",
"bpe_tokenizer.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
)
"""Defines targets that should be shared between fbcode and xplat.
The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""
runtime.cxx_library(
name = "tiktoken",
srcs = [
"tiktoken.cpp",
"llama_tiktoken.cpp",
],
exported_headers = [
"tokenizer.h",
"tiktoken.h",
"llama_tiktoken.h",
"base64.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
"//executorch/extension/llm/tokenizer:tiktoken",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
exported_external_deps = [
"re2",
],
)
34 changes: 17 additions & 17 deletions examples/models/llama2/tokenizer/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,31 +21,31 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)

include(${EXECUTORCH_ROOT}/build/Test.cmake)

set(
_tokenizer_test_srcs
test_tiktoken.cpp
test_bpe_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
set(_tokenizer_test_srcs
test_tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
)

set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

et_cxx_test(
tokenizer_test
SOURCES
${_tokenizer_test_srcs}
EXTRA_LIBS
re2::re2
)
et_cxx_test(tokenizer_test SOURCES ${_tokenizer_test_srcs} EXTRA_LIBS re2::re2)
target_include_directories(
tokenizer_test PRIVATE ${CMAKE_INSTALL_PREFIX}/include ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
tokenizer_test
PRIVATE
${CMAKE_INSTALL_PREFIX}/include
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
)
17 changes: 0 additions & 17 deletions examples/models/llama2/tokenizer/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,6 @@ def define_common_targets():
The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""

runtime.cxx_test(
name = "test_bpe_tokenizer",
srcs = [
"test_bpe_tokenizer.cpp",
],
deps = [
"//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
],
env = {
"RESOURCES_PATH": "$(location :resources)/resources",
},
)

runtime.cxx_test(
name = "test_tiktoken",
srcs = [
Expand All @@ -31,9 +17,6 @@ def define_common_targets():
env = {
"RESOURCES_PATH": "$(location :resources)/resources",
},
external_deps = [
"re2",
],
)

runtime.filegroup(
Expand Down
94 changes: 0 additions & 94 deletions examples/models/llama2/tokenizer/test/test_tiktoken.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
*/

#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/runtime/platform/runtime.h>
#include <gtest/gtest.h>
#include <vector>
Expand All @@ -17,19 +16,6 @@ using namespace ::testing;
namespace torch {
namespace executor {

class TiktokenExtensionTest : public Test {
public:
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = get_tiktoken_for_llama();
modelPath_ = std::getenv("RESOURCES_PATH") +
std::string("/test_tiktoken_tokenizer.model");
}

std::unique_ptr<Tokenizer> tokenizer_;
std::string modelPath_;
};

class MultimodalTiktokenV5ExtensionTest : public Test {
public:
void SetUp() override {
Expand All @@ -43,24 +29,6 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
std::string modelPath_;
};

TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) {
Result<std::vector<uint64_t>> res = tokenizer_->encode("hello world", 0, 0);
EXPECT_EQ(res.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
auto result = tokenizer_->decode(0, 0);
EXPECT_EQ(result.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
EXPECT_EQ(tokenizer_->vocab_size(), 128256);
EXPECT_EQ(tokenizer_->bos_tok(), 128000);
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Expand All @@ -69,17 +37,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
}

TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
EXPECT_EQ(out.error(), Error::Ok);
EXPECT_EQ(out.get().size(), 3);
EXPECT_EQ(out.get()[0], 128000);
EXPECT_EQ(out.get()[1], 15339);
EXPECT_EQ(out.get()[2], 1917);
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Expand All @@ -101,18 +58,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
}
}

TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
std::vector<uint64_t> tokens = {128000, 15339, 1917};
for (size_t i = 0; i < tokens.size(); i++) {
Result<std::string> out = tokenizer_->decode(0, tokens[i]);
EXPECT_EQ(out.error(), Error::Ok);
EXPECT_EQ(out.get(), expected[i]);
}
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Expand All @@ -134,44 +79,5 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
EXPECT_EQ(out.get(), expected[i]);
}
}

TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// The vocab size is 128256, addes 256 just so the token is out of vocab
// range.
Result<std::string> out = tokenizer_->decode(0, 128256 + 256);
EXPECT_EQ(out.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, ConstructionWithInvalidBOSIndex) {
// gtest death test doesn't work on iOS:
// https://github.com/google/googletest/issues/2834
#if !GTEST_OS_IOS
EXPECT_EXIT(
std::make_unique<Tiktoken>(
std::make_unique<std::vector<std::string>>(
std::vector<std::string>{"<|end_of_text|>"}),
1,
0),
::testing::KilledBySignal(SIGABRT),
"");
#endif
}

TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) {
// gtest death test doesn't work on iOS:
// https://github.com/google/googletest/issues/2834
#if !GTEST_OS_IOS
EXPECT_EXIT(
std::make_unique<Tiktoken>(
std::make_unique<std::vector<std::string>>(
std::vector<std::string>{"<|begin_of_text|>"}),
0,
1),
::testing::KilledBySignal(SIGABRT),
"");
#endif
}
} // namespace executor
} // namespace torch
2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
// A simple llama2 runner that includes preprocessing and post processing logic.
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/examples/qualcomm/llama2/runner/runner.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include <ctime>
Expand Down
4 changes: 2 additions & 2 deletions extension/android/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,11 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/abseil-cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
Expand Down
1 change: 1 addition & 0 deletions extension/llm/third-party/abseil-cpp
Submodule abseil-cpp added at eb8522
1 change: 1 addition & 0 deletions extension/llm/third-party/re2
Submodule re2 added at 6dcd83
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

#include <string>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#pragma once

#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <cstdint>

namespace torch {
Expand Down
Loading

0 comments on commit 0cde6b8

Please sign in to comment.