diff --git a/.gitignore b/.gitignore
index a5e2deb..7f18f11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,8 @@
 *.a
 *.lib
 
+build
+
 # Executables
 *.exe
 *.out
diff --git a/README.md b/README.md
index a652ff8..01cdb7e 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,84 @@
 # tokenizers-cpp
-Cross platform universal tokenizer binding to HF and sentencepiece
+This project provides a cross-platform C++ tokenizer binding library that can be universally deployed.
+It wraps and binds the [HuggingFace tokenizers library](https://github.com/huggingface/tokenizers)
+and [sentencepiece](https://github.com/google/sentencepiece) and provides a minimum common interface in C++.
+
+The main goal of the project is to enable tokenizer deployment for language model applications
+to native platforms with minimum dependencies, and to remove some of the barriers of
+cross-language bindings. This project is developed in part with, and used in,
+[MLC LLM](https://github.com/mlc-ai/mlc-llm). We have tested the following platforms:
+
+- iOS
+- Android
+- Windows
+- Linux
+- Web browser
+
+## Getting Started
+
+The easiest way is to add this project as a submodule and then
+include it via `add_subdirectory` in your CMake project.
+You will also need to enable C++17 support.
+
+- First, make sure you have Rust installed.
+- If you are cross-compiling, make sure you install the necessary Rust target.
+  For example, run `rustup target add aarch64-apple-ios` to install the iOS target.
+- You can then link the library `tokenizers_cpp` into your project
+  (a minimal CMake sketch is included near the end of this README).
+
+See the [example](example) folder for an example CMake project.
+
+### Example Code
+
+```c++
+// Expects a HuggingFace tokenizer file at dist/tokenizer.json
+void HuggingFaceTokenizerExample() {
+  // Read blob from file.
+  auto blob = LoadBytesFromFile("dist/tokenizer.json");
+  // Note: all the current factory APIs take an in-memory blob as input.
+  // This gives some flexibility on how these blobs can be read.
+  auto tok = Tokenizer::FromBlobJSON(blob);
+  std::string prompt = "What is the capital of Canada?";
+  // call Encode to turn prompt into token ids
+  std::vector<int32_t> ids = tok->Encode(prompt);
+  // call Decode to turn ids into string
+  std::string decoded_prompt = tok->Decode(ids);
+}
+
+void SentencePieceTokenizerExample() {
+  // Read blob from file.
+  auto blob = LoadBytesFromFile("dist/tokenizer.model");
+  // Note: all the current factory APIs take an in-memory blob as input.
+  // This gives some flexibility on how these blobs can be read.
+  auto tok = Tokenizer::FromBlobSentencePiece(blob);
+  std::string prompt = "What is the capital of Canada?";
+  // call Encode to turn prompt into token ids
+  std::vector<int32_t> ids = tok->Encode(prompt);
+  // call Decode to turn ids into string
+  std::string decoded_prompt = tok->Decode(ids);
+}
+```
+
+### Extra Details
+
+Currently, the project generates three static libraries:
+- `libtokenizers_c.a`: the C binding to the HuggingFace tokenizers Rust library
+- `libsentencepiece.a`: the sentencepiece static library
+- `libtokenizers_cpp.a`: the C++ binding implementation
+
+If you are using an IDE, you can likely first use CMake to generate
+these libraries and add them to your development environment.
+If you are using CMake, `target_link_libraries(yourlib tokenizers_cpp)`
+will automatically link in the other two libraries.
+You can also check out [MLC LLM](https://github.com/mlc-ai/mlc-llm)
+as an example of a complete LLM chat application integration.
+
+## JavaScript Support
+
+We use Emscripten to expose tokenizers-cpp to WASM and JavaScript.
+Check out [web](web) for more details.
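+
+## Minimal CMake Integration (Sketch)
+
+As a quick recap of the [Getting Started](#getting-started) steps above, the snippet below
+sketches how a consumer project might wire in tokenizers-cpp. The submodule path
+`3rdparty/tokenizers-cpp` and the target name `your_target` are illustrative placeholders;
+see the [example](example) folder for a complete, working project.
+
+```cmake
+# Assumes this repository was added as a git submodule under 3rdparty/tokenizers-cpp.
+# Build the bundled tokenizers_c, sentencepiece, and tokenizers_cpp targets.
+add_subdirectory(3rdparty/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL)
+
+# tokenizers-cpp requires C++17 support.
+target_compile_features(your_target PRIVATE cxx_std_17)
+
+# Make <tokenizers_cpp.h> visible to your sources.
+target_include_directories(your_target PRIVATE 3rdparty/tokenizers-cpp/include)
+
+# Linking tokenizers_cpp automatically pulls in the other two static libraries.
+target_link_libraries(your_target PRIVATE tokenizers_cpp)
+```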
+
+## Acknowledgements
+
+This project is only possible thanks to the shoulders of the open-source ecosystems that we stand on.
+It is built on top of the sentencepiece and tokenizers libraries.
diff --git a/example/.gitignore b/example/.gitignore
new file mode 100644
index 0000000..9d0b71a
--- /dev/null
+++ b/example/.gitignore
@@ -0,0 +1,2 @@
+build
+dist
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
new file mode 100644
index 0000000..87a8f15
--- /dev/null
+++ b/example/CMakeLists.txt
@@ -0,0 +1,28 @@
+
+# Example CMake project
+cmake_minimum_required(VERSION 3.18)
+project(tokenizers_cpp_example C CXX)
+
+include(CheckCXXCompilerFlag)
+if(NOT MSVC)
+  check_cxx_compiler_flag("-std=c++17" SUPPORT_CXX17)
+  set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CUDA_STANDARD 17)
+else()
+  check_cxx_compiler_flag("/std:c++17" SUPPORT_CXX17)
+  set(CMAKE_CXX_FLAGS "/std:c++17 ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CUDA_STANDARD 17)
+endif()
+
+# include tokenizers-cpp as a subdirectory
+set(TOKENZIER_CPP_PATH ..)
+add_subdirectory(${TOKENZIER_CPP_PATH} tokenizers EXCLUDE_FROM_ALL)
+
+add_executable(example example.cc)
+
+target_include_directories(example PRIVATE ${TOKENZIER_CPP_PATH}/include)
+
+# You can link tokenizers_cpp; it will automatically link tokenizers_c
+# and the sentencepiece library
+target_link_libraries(example PRIVATE tokenizers_cpp)
+
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..2ab844b
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,8 @@
+# Example Project
+
+This is an example CMake project that uses tokenizers-cpp.
+
+To build and run the example:
+```bash
+./build_and_run.sh
+```
diff --git a/example/build_and_run.sh b/example/build_and_run.sh
new file mode 100755
index 0000000..84d19f8
--- /dev/null
+++ b/example/build_and_run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# build
+mkdir -p build
+cd build
+cmake ..
+make -j8
+cd ..
+
+# get example tokenizer files
+mkdir -p dist
+cd dist
+if [ ! -f "tokenizer.model" ]; then
+  wget https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model
+fi
+if [ ! -f "tokenizer.json" ]; then
+  wget https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1/resolve/main/tokenizer.json
+fi
+cd ..
+
+# run
+echo "---Running example----"
+./build/example
diff --git a/example/example.cc b/example/example.cc
new file mode 100644
index 0000000..87048df
--- /dev/null
+++ b/example/example.cc
@@ -0,0 +1,76 @@
+#include <tokenizers_cpp.h>
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+using tokenizers::Tokenizer;
+
+std::string LoadBytesFromFile(const std::string& path) {
+  std::ifstream fs(path, std::ios::in | std::ios::binary);
+  if (fs.fail()) {
+    std::cerr << "Cannot open " << path << std::endl;
+    exit(1);
+  }
+  std::string data;
+  fs.seekg(0, std::ios::end);
+  size_t size = static_cast<size_t>(fs.tellg());
+  fs.seekg(0, std::ios::beg);
+  data.resize(size);
+  fs.read(data.data(), size);
+  return data;
+}
+
+void PrintEncodeResult(const std::vector<int32_t>& ids) {
+  std::cout << "tokens=[";
+  for (size_t i = 0; i < ids.size(); ++i) {
+    if (i != 0) std::cout << ", ";
+    std::cout << ids[i];
+  }
+  std::cout << "]" << std::endl;
+}
+
+// Sentencepiece tokenizer
+// - dist/tokenizer.model
+void SentencePieceTokenizerExample() {
+  // Read blob from file.
+  auto blob = LoadBytesFromFile("dist/tokenizer.model");
+  // Note: all the current factory APIs take an in-memory blob as input.
+  // This gives some flexibility on how these blobs can be read.
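+  // For example, the same bytes could equally come from an embedded asset or a
+  // network download instead of the local filesystem.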
+  auto tok = Tokenizer::FromBlobSentencePiece(blob);
+  std::string prompt = "What is the capital of Canada?";
+  // call Encode to turn prompt into token ids
+  std::vector<int32_t> ids = tok->Encode(prompt);
+  // call Decode to turn ids into string
+  std::string decoded_prompt = tok->Decode(ids);
+
+  // print encoded result
+  std::cout << "SentencePiece tokenizer: " << std::endl;
+  PrintEncodeResult(ids);
+  std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
+}
+
+// HF tokenizer
+// - dist/tokenizer.json
+void HuggingFaceTokenizerExample() {
+  // Read blob from file.
+  auto blob = LoadBytesFromFile("dist/tokenizer.json");
+  // Note: all the current factory APIs take an in-memory blob as input.
+  // This gives some flexibility on how these blobs can be read.
+  auto tok = Tokenizer::FromBlobJSON(blob);
+  std::string prompt = "What is the capital of Canada?";
+  // call Encode to turn prompt into token ids
+  std::vector<int32_t> ids = tok->Encode(prompt);
+  // call Decode to turn ids into string
+  std::string decoded_prompt = tok->Decode(ids);
+
+  // print encoded result
+  std::cout << "HF tokenizer: " << std::endl;
+  PrintEncodeResult(ids);
+  std::cout << "decode=\"" << decoded_prompt << "\"" << std::endl;
+}
+
+int main(int argc, char* argv[]) {
+  SentencePieceTokenizerExample();
+  HuggingFaceTokenizerExample();
+}
diff --git a/src/tokenizers_c.h b/include/tokenizers_c.h
similarity index 97%
rename from src/tokenizers_c.h
rename to include/tokenizers_c.h
index aae6204..e1b77ab 100644
--- a/src/tokenizers_c.h
+++ b/include/tokenizers_c.h
@@ -37,4 +37,4 @@ void tokenizers_free(TokenizerHandle handle);
 #ifdef __cplusplus
 }
 #endif
-#endif  // TOKENIZERS_C_H_
\ No newline at end of file
+#endif  // TOKENIZERS_C_H_
diff --git a/src/huggingface_tokenizer.cc b/src/huggingface_tokenizer.cc
index 705d3ba..82cb441 100644
--- a/src/huggingface_tokenizer.cc
+++ b/src/huggingface_tokenizer.cc
@@ -4,10 +4,9 @@
  * \file huggingface_tokenizer.cc
  * \brief Huggingface tokenizer
  */
+#include <tokenizers_c.h>
 #include <tokenizers_cpp.h>
 
-#include "tokenizers_c.h"
-
 namespace tokenizers {
 /*!
  * \brief A simple c++ header of tokenizer via C API.