Skip to content

Commit

Permalink
Move tokenizer into extension/llm/tokenizer (pytorch#4278)
Browse files Browse the repository at this point in the history
Summary:
This PR moves the tiktoken and bpe tokenizers into `extension/llm/tokenizer` such that they can be reused by other models.

Note: Currently the tiktoken tokenizer has two sets of unit tests based on llama2's tokenizer:
- default
- multimodal

This PR only moves the default unit test into extension and keeps the multimodal's unit tests inside llama2/tokenizer.

Pull Request resolved: pytorch#4278

Test Plan:
- test/run_oss_cpp_tests.sh examples/models/llama2/tokenizer/test
- test/run_oss_cpp_tests.sh extension/llm/tokenizer/test

Reviewed By: larryliu0820

Differential Revision: D59822702

Pulled By: helunwencser

fbshipit-source-id: 5d51ba3e44c9b2d9dc77b9f4349b58947ed68502
  • Loading branch information
helunwencser authored and facebook-github-bot committed Jul 17, 2024
1 parent 740a0a5 commit 0cde6b8
Show file tree
Hide file tree
Showing 31 changed files with 128,324 additions and 180 deletions.
12 changes: 6 additions & 6 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@
[submodule "examples/third-party/LLaVA"]
path = examples/third-party/LLaVA
url = https://github.com/haotian-liu/LLaVA.git
[submodule "examples/models/llama2/third-party/re2"]
path = examples/models/llama2/third-party/re2
url = https://github.com/google/re2.git
[submodule "examples/models/llama2/third-party/abseil-cpp"]
path = examples/models/llama2/third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
[submodule "third-party/ios-cmake"]
path = third-party/ios-cmake
url = https://github.com/leetal/ios-cmake
[submodule "examples/models/phi-3-mini/third-party/sentencepiece"]
path = examples/models/phi-3-mini/third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
[submodule "extension/llm/third-party/re2"]
path = extension/llm/third-party/re2
url = https://github.com/google/re2.git
[submodule "extension/llm/third-party/abseil-cpp"]
path = extension/llm/third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
10 changes: 8 additions & 2 deletions examples/models/llama2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,14 @@ if(EXECUTORCH_USE_TIKTOKEN)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
target_link_libraries(llama_runner PUBLIC re2::re2)
endif()
Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ target_include_directories(

if(EXECUTORCH_USE_TIKTOKEN)
list(APPEND _llama_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
)
list(APPEND _llama_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp
Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#if ET_USE_TIKTOKEN
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#else /* BPE */
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#endif /* ET_USE_TIKTOKEN*/
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/runner_util/managed_tensor.h>
Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <unordered_map>

#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

Expand Down
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def define_common_targets():
] + ([
"//executorch/examples/models/llama2/tokenizer:tiktoken",
] if use_tiktoken() else [
"//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
]) + (_get_operator_lib(aten)) + ([
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
# Therefore enable it explicitly for now to avoid failing tests
Expand Down
1 change: 0 additions & 1 deletion examples/models/llama2/third-party/abseil-cpp
Submodule abseil-cpp deleted from 854193
1 change: 0 additions & 1 deletion examples/models/llama2/third-party/re2
Submodule re2 deleted from ac82d4
2 changes: 1 addition & 1 deletion examples/models/llama2/tokenizer/llama_tiktoken.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#pragma once

#include <executorch/examples/models/llama2/tokenizer/tiktoken.h>
#include <executorch/extension/llm/tokenizer/tiktoken.h>

namespace torch {
namespace executor {
Expand Down
31 changes: 5 additions & 26 deletions examples/models/llama2/tokenizer/targets.bzl
Original file line number Diff line number Diff line change
@@ -1,44 +1,23 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
runtime.cxx_library(
name = "bpe_tokenizer",
srcs = [
"bpe_tokenizer.cpp",
],
exported_headers = [
"tokenizer.h",
"bpe_tokenizer.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
)
"""Defines targets that should be shared between fbcode and xplat.
The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""
runtime.cxx_library(
name = "tiktoken",
srcs = [
"tiktoken.cpp",
"llama_tiktoken.cpp",
],
exported_headers = [
"tokenizer.h",
"tiktoken.h",
"llama_tiktoken.h",
"base64.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
"//executorch/extension/llm/tokenizer:tiktoken",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
exported_external_deps = [
"re2",
],
)
34 changes: 17 additions & 17 deletions examples/models/llama2/tokenizer/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,31 +21,31 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)

include(${EXECUTORCH_ROOT}/build/Test.cmake)

set(
_tokenizer_test_srcs
test_tiktoken.cpp
test_bpe_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
set(_tokenizer_test_srcs
test_tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
)

set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

et_cxx_test(
tokenizer_test
SOURCES
${_tokenizer_test_srcs}
EXTRA_LIBS
re2::re2
)
et_cxx_test(tokenizer_test SOURCES ${_tokenizer_test_srcs} EXTRA_LIBS re2::re2)
target_include_directories(
tokenizer_test PRIVATE ${CMAKE_INSTALL_PREFIX}/include ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
tokenizer_test
PRIVATE
${CMAKE_INSTALL_PREFIX}/include
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
)
17 changes: 0 additions & 17 deletions examples/models/llama2/tokenizer/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,6 @@ def define_common_targets():
The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""

runtime.cxx_test(
name = "test_bpe_tokenizer",
srcs = [
"test_bpe_tokenizer.cpp",
],
deps = [
"//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
],
env = {
"RESOURCES_PATH": "$(location :resources)/resources",
},
)

runtime.cxx_test(
name = "test_tiktoken",
srcs = [
Expand All @@ -31,9 +17,6 @@ def define_common_targets():
env = {
"RESOURCES_PATH": "$(location :resources)/resources",
},
external_deps = [
"re2",
],
)

runtime.filegroup(
Expand Down
94 changes: 0 additions & 94 deletions examples/models/llama2/tokenizer/test/test_tiktoken.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
*/

#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/runtime/platform/runtime.h>
#include <gtest/gtest.h>
#include <vector>
Expand All @@ -17,19 +16,6 @@ using namespace ::testing;
namespace torch {
namespace executor {

class TiktokenExtensionTest : public Test {
public:
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = get_tiktoken_for_llama();
modelPath_ = std::getenv("RESOURCES_PATH") +
std::string("/test_tiktoken_tokenizer.model");
}

std::unique_ptr<Tokenizer> tokenizer_;
std::string modelPath_;
};

class MultimodalTiktokenV5ExtensionTest : public Test {
public:
void SetUp() override {
Expand All @@ -43,24 +29,6 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
std::string modelPath_;
};

TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) {
Result<std::vector<uint64_t>> res = tokenizer_->encode("hello world", 0, 0);
EXPECT_EQ(res.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
auto result = tokenizer_->decode(0, 0);
EXPECT_EQ(result.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
EXPECT_EQ(tokenizer_->vocab_size(), 128256);
EXPECT_EQ(tokenizer_->bos_tok(), 128000);
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Expand All @@ -69,17 +37,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
}

TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
EXPECT_EQ(out.error(), Error::Ok);
EXPECT_EQ(out.get().size(), 3);
EXPECT_EQ(out.get()[0], 128000);
EXPECT_EQ(out.get()[1], 15339);
EXPECT_EQ(out.get()[2], 1917);
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Expand All @@ -101,18 +58,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
}
}

TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
std::vector<uint64_t> tokens = {128000, 15339, 1917};
for (size_t i = 0; i < tokens.size(); i++) {
Result<std::string> out = tokenizer_->decode(0, tokens[i]);
EXPECT_EQ(out.error(), Error::Ok);
EXPECT_EQ(out.get(), expected[i]);
}
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Expand All @@ -134,44 +79,5 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
EXPECT_EQ(out.get(), expected[i]);
}
}

TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// The vocab size is 128256, addes 256 just so the token is out of vocab
// range.
Result<std::string> out = tokenizer_->decode(0, 128256 + 256);
EXPECT_EQ(out.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, ConstructionWithInvalidBOSIndex) {
// gtest death test doesn't work on iOS:
// https://github.com/google/googletest/issues/2834
#if !GTEST_OS_IOS
EXPECT_EXIT(
std::make_unique<Tiktoken>(
std::make_unique<std::vector<std::string>>(
std::vector<std::string>{"<|end_of_text|>"}),
1,
0),
::testing::KilledBySignal(SIGABRT),
"");
#endif
}

TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) {
// gtest death test doesn't work on iOS:
// https://github.com/google/googletest/issues/2834
#if !GTEST_OS_IOS
EXPECT_EXIT(
std::make_unique<Tiktoken>(
std::make_unique<std::vector<std::string>>(
std::vector<std::string>{"<|begin_of_text|>"}),
0,
1),
::testing::KilledBySignal(SIGABRT),
"");
#endif
}
} // namespace executor
} // namespace torch
2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
// A simple llama2 runner that includes preprocessing and post processing logic.
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/examples/qualcomm/llama2/runner/runner.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include <ctime>
Expand Down
4 changes: 2 additions & 2 deletions extension/android/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,11 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/abseil-cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
Expand Down
1 change: 1 addition & 0 deletions extension/llm/third-party/abseil-cpp
Submodule abseil-cpp added at eb8522
1 change: 1 addition & 0 deletions extension/llm/third-party/re2
Submodule re2 added at 6dcd83
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

#include <string>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#pragma once

#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <cstdint>

namespace torch {
Expand Down
Loading

0 comments on commit 0cde6b8

Please sign in to comment.