redo: New tokenizer implementation for MPT and GPT-J #765

Closed · wants to merge 1 commit
2 changes: 1 addition & 1 deletion .codespellrc
@@ -1,4 +1,4 @@
[codespell]
skip = .git,*.pdf,*.svg
skip = .git,*.pdf,*.svg,*_tokenizer_config.h
#
# ignore-words-list =
8 changes: 6 additions & 2 deletions gpt4all-backend/CMakeLists.txt
@@ -23,6 +23,7 @@ set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)

set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE)
set(BUILD_SHARED_LIBS ON FORCE)
@@ -34,19 +35,22 @@ if (GPT4ALL_AVX_ONLY)
set(LLAMA_FMA OFF CACHE BOOL "llama: enable FMA" FORCE)
endif()

find_package(ICU REQUIRED COMPONENTS uc i18n)
add_subdirectory(llama.cpp)

add_library(llmodel
gptj.h gptj.cpp
llamamodel.h llamamodel.cpp
llama.cpp/examples/common.cpp
llmodel.h llmodel_c.h llmodel_c.cpp
mpt.h mpt.cpp
mpt.h mpt.cpp tokenizer/bpe.cpp tokenizer/bpe.h
tokenizer/mpt_tokenizer_config.h tokenizer/gptj_tokenizer_config.h
utils.h utils.cpp
)

target_link_libraries(llmodel
PRIVATE llama)
PRIVATE llama
PUBLIC ICU::uc ICU::i18n)

set_target_properties(llmodel PROPERTIES
VERSION ${PROJECT_VERSION}
10 changes: 7 additions & 3 deletions gpt4all-backend/gptj.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <map>
#include <string>
@@ -860,6 +861,8 @@ bool GPTJ::loadModel(const std::string &modelPath) {
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = true;
fflush(stdout);

get_bpecpp_tokenizer(TokenizerType::GPTJ, m_bpe, m_tokav);
return true;
}

@@ -915,7 +918,7 @@ void GPTJ::prompt(const std::string &prompt,
int64_t t_prompt_us = 0;

// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt);
std::vector<uint32_t> embd_inp = m_tokav->encode(prompt, *m_bpe);

// save the context size
promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;
@@ -1032,7 +1035,7 @@ void GPTJ::prompt(const std::string &prompt,
if (id == 50256 /*end of text*/)
goto stop_generating;

const std::string str = d_ptr->vocab.id_to_token[id];
const std::string str = m_tokav->decode({(uint32_t) id}, *m_bpe, true, false);

// Check if the provided str is part of our reverse prompts
bool foundPartialReversePrompt = false;
@@ -1062,7 +1065,8 @@ void GPTJ::prompt(const std::string &prompt,
if (promptCtx.tokens.size() == promptCtx.n_ctx)
promptCtx.tokens.erase(promptCtx.tokens.begin());
promptCtx.tokens.push_back(t);
if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))
const std::string decoded = m_tokav->decode({(uint32_t) t}, *m_bpe, true, false);
if (!responseCallback(t, decoded))
goto stop_generating;
}
cachedTokens.clear();
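The hunks above replace the old gpt_vocab lookups (gpt_tokenize and id_to_token) with the new bpecpp tokenizer selected by get_bpecpp_tokenizer. As a reading aid, here is a minimal C++ sketch of the resulting encode/decode round trip. The signatures are inferred from the call sites in this diff only; the real declarations live in tokenizer/bpe.h, which is not shown on this page, so treat the parameter meanings as assumptions.

// Sketch only: types and signatures inferred from the call sites above, not
// taken from tokenizer/bpe.h (that header is not visible in this diff).
#include <memory>
#include <string>
#include <vector>
#include "tokenizer/bpe.h"

std::unique_ptr<bpecpp::AdditionalVocabAdapter> tokav;
std::unique_ptr<bpecpp::BPE> bpe;

void tokenizer_round_trip(const std::string &prompt) {
    // Select the embedded tokenizer config for the model family
    // (TokenizerType::GPTJ here; MPT and MPT_CHAT are used in mpt.cpp).
    get_bpecpp_tokenizer(TokenizerType::GPTJ, bpe, tokav);

    // Encode: prompt text -> token ids (now uint32_t rather than gpt_vocab::id).
    std::vector<uint32_t> ids = tokav->encode(prompt, *bpe);

    // Decode: token ids -> text. The two boolean flags mirror the calls above;
    // their exact meaning is an assumption, since bpe.h is not shown here.
    std::string text = tokav->decode(ids, *bpe, true, false);
}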
3 changes: 3 additions & 0 deletions gpt4all-backend/gptj.h
@@ -5,6 +5,7 @@
#include <functional>
#include <vector>
#include "llmodel.h"
#include "tokenizer/bpe.h"

class GPTJPrivate;
class GPTJ : public LLModel {
@@ -31,6 +32,8 @@ class GPTJ : public LLModel {

private:
GPTJPrivate *d_ptr;
std::unique_ptr<bpecpp::AdditionalVocabAdapter> m_tokav;
std::unique_ptr<bpecpp::BPE> m_bpe;
};

#endif // GPTJ_H
15 changes: 12 additions & 3 deletions gpt4all-backend/mpt.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <map>
#include <random>
@@ -785,6 +786,12 @@ bool MPT::loadModel(const std::string &modelPath) {
d_ptr->modelLoaded = true;
d_ptr->has_im_end = d_ptr->vocab.token_to_id.find("<|im_end|>") != d_ptr->vocab.token_to_id.end();
fflush(stdout);

if (modelPath.find("-chat") != std::string::npos) {
get_bpecpp_tokenizer(TokenizerType::MPT_CHAT, m_bpe, m_tokav);
} else {
get_bpecpp_tokenizer(TokenizerType::MPT, m_bpe, m_tokav);
}
return true;
}

@@ -840,7 +847,7 @@ void MPT::prompt(const std::string &prompt,
int64_t t_prompt_us = 0;

// tokenize the prompt
std::vector<int> embd_inp = gpt_tokenize(d_ptr->vocab, prompt);
std::vector<uint32_t> embd_inp = m_tokav->encode(prompt, *m_bpe);

// save the context size
promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;
@@ -906,6 +913,7 @@ void MPT::prompt(const std::string &prompt,
int r_instructFound = 0;

std::string cachedResponse;
std::string decodeBuffer;
std::vector<int> cachedTokens;
std::unordered_set<std::string> reversePrompts
= { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
@@ -961,7 +969,7 @@ void MPT::prompt(const std::string &prompt,
if (id == 0 /*end of text*/)
goto stop_generating;

const std::string str = d_ptr->vocab.id_to_token[id];
const std::string str = m_tokav->decode({(uint32_t) id}, *m_bpe, true, false);

// Check if the provided str is part of our reverse prompts
bool foundPartialReversePrompt = false;
@@ -991,7 +999,8 @@ void MPT::prompt(const std::string &prompt,
if (promptCtx.tokens.size() == promptCtx.n_ctx)
promptCtx.tokens.erase(promptCtx.tokens.begin());
promptCtx.tokens.push_back(t);
if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))
const std::string decoded = m_tokav->decode({(uint32_t) t}, *m_bpe, true, false);
if (!responseCallback(t, decoded))
goto stop_generating;
}
cachedTokens.clear();
3 changes: 3 additions & 0 deletions gpt4all-backend/mpt.h
@@ -5,6 +5,7 @@
#include <functional>
#include <vector>
#include "llmodel.h"
#include "tokenizer/bpe.h"

class MPTPrivate;
class MPT : public LLModel {
@@ -31,6 +32,8 @@ class MPT : public LLModel {

private:
MPTPrivate *d_ptr;
std::unique_ptr<bpecpp::AdditionalVocabAdapter> m_tokav;
std::unique_ptr<bpecpp::BPE> m_bpe;
};

#endif // MPT_H
136 changes: 136 additions & 0 deletions gpt4all-backend/scripts/gen_tokenizer_include.py
@@ -0,0 +1,136 @@
import sys
import json
from dataclasses import dataclass

def iter_with_last(lst):
    llen = len(lst)
    for i, entry in enumerate(lst):
        last = i == (llen - 1)
        yield last, entry

@dataclass
class BufSlice:
    offset: int
    length: int
    def __repr__(self):
        return '{'f'0x{self.offset:x},{self.length}''}'

def c_str_dump(bs):
    s = bytearray()
    s += b'"'
    llen = 0
    lasthex = False
    for byte in bs:
        if byte in (b' 01234567890abcdefghijklmnopqrstuvwxyz_-=/;:<>'
                    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*(),.[]{}`~|'):
            # need to avoid hex characters not part of a hex escape
            # appearing directly after a hex escape
            if lasthex and byte in b'0123456789abcdefABCDEF':
                s += b'""'
                llen += 2
            s += bytes([byte])
            llen += 1
            lasthex = False
        else:
            s += f'\\x{byte:02x}'.encode('utf8')
            llen += 4
            lasthex = True
        if llen >= 80:
            llen = 0
            s += b"\"\n\""
    s += b'"'
    return s.decode('utf8')

class Buf:
    def __init__(self):
        self.buf = b''
        self.cache = {}

    def get(self, s):
        if s in self.cache:
            return self.cache[s]
        offset = len(self.buf)
        bs = s.encode('utf8')
        exoffs = self.buf.find(bs)
        if exoffs != -1:
            slc = BufSlice(offset=exoffs, length=len(bs))
            self.cache[s] = slc
            return slc
        return None

    def insert(self, s):
        slc = self.get(s)
        if slc is None:
            bs = s.encode('utf8')
            offset = len(self.buf)
            self.buf += bs
            slc = BufSlice(offset=offset, length=len(bs))
        return slc

class BreakEvery:
    def __init__(self, n):
        self.counter = 0
        self.n = n

    def __repr__(self):
        self.counter += 1
        self.counter %= self.n
        if self.counter == 0:
            return '\n'
        return ''

def do_convert(tkfilename, prefix):
    with open(tkfilename, 'rb') as tkf:
        tokconfig = json.load(tkf)

    # every string in the vocab also appears in the merges list so we can store
    # much less data in the binary by deduplicating these references; sorting by
    # length descending makes it more likely that prefixes of longer strings get
    # deduped, and secondarily sorting them lexicographically makes the buffer
    # data more compressible (the data is not compressed in the binary itself,
    # but the binary will be more compressible)
    split_merges = [s.split(' ') for s in tokconfig['model']['merges']]
    len_then = lambda m: (len(m), m)
    avwords = sorted((av['content'] for av in tokconfig['added_tokens']), key=len_then, reverse=True)
    all_strs = avwords + sorted(list(tokconfig['model']['vocab'].keys()), key=len_then, reverse=True)
    buf = Buf()
    for s in all_strs:
        buf.insert(s)

    print('// @generated GENERATED BY scripts/gen_tokenizer_include.py DO NOT MODIFY')
    print(f'#ifndef {prefix.upper()}_TOKENIZER_CONFIG_H_')
    print(f'#define {prefix.upper()}_TOKENIZER_CONFIG_H_')
    print('#include "bpe.h"')
    print(f"// buflen {len(buf.buf)}")
    print(f"constexpr const char {prefix}_buffer[] =\n{c_str_dump(buf.buf)};")
    avilen = len(tokconfig['added_tokens'])
    print(f'constexpr std::array<bpecpp::additional_vocab_item_embedded, {avilen}> {prefix}_additional_vocab = ''{{')
    for last, avi in iter_with_last(tokconfig['added_tokens']):
        comma = ',' if not last else ''
        print(' {'f'.id = {avi["id"]}, .content={buf.get(avi["content"])}, .special={json.dumps(avi["special"])}''}' + comma)
    print('}};')
    print()
    mergeslen = len(tokconfig['model']['merges'])
    print(f'constexpr std::array<std::pair<bpecpp::buf_ref, bpecpp::buf_ref>, {mergeslen}> {prefix}_merges = ''{{')
    breaker = BreakEvery(4)
    for last, (ma, mb) in iter_with_last(split_merges):
        comma = ',' if not last else ''
        print(' {'f'{buf.get(ma)},{buf.get(mb)}''}' + comma + repr(breaker), end='')
    print('\n}};')
    vocablen = len(tokconfig['model']['vocab'])
    print(f'constexpr std::array<bpecpp::buf_ref, {vocablen}> {prefix}_vocab = ''{{')
    breaker = BreakEvery(8)
    for last, vi in iter_with_last(tokconfig['model']['vocab']):
        comma = ',' if not last else ''
        print(f' {buf.get(vi)}' + comma + repr(breaker), end='')
    print('\n}};')
    print(f'#endif // {prefix.upper()}_TOKENIZER_CONFIG_H_')

def main():
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} <hf tokenizer json> <symbol prefix>')
        sys.exit(1)
    do_convert(sys.argv[1], sys.argv[2])

if __name__ == '__main__':
    main()
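The script above emits the generated header on stdout, so it is presumably run as something like python scripts/gen_tokenizer_include.py tokenizer.json mpt > tokenizer/mpt_tokenizer_config.h (the exact invocation is not shown in this PR). Each emitted vocab and merge entry is an (offset, length) slice into the shared string buffer rather than an owned string. The stand-in C++ sketch below shows how such a slice resolves back into text; bpecpp::buf_ref itself is defined in bpe.h, which is not visible here, so the struct and field names below are assumptions for illustration only.

// Stand-in illustration only: demo_buf_ref approximates what the script's
// {0x<offset>,<length>} initializers describe; the real bpecpp::buf_ref lives
// in tokenizer/bpe.h and may differ.
#include <cstdint>
#include <cstdio>
#include <string_view>

struct demo_buf_ref {   // hypothetical stand-in for bpecpp::buf_ref
    uint32_t offset;
    uint32_t length;
};

// Toy shared buffer as the generator would emit it; "low" needs no extra
// storage because its bytes already occur inside "helloworld".
constexpr char demo_buffer[] = "helloworld";
constexpr demo_buf_ref demo_vocab[] = { {0, 5}, {5, 5}, {3, 3} };

constexpr std::string_view resolve(demo_buf_ref r) {
    return std::string_view(demo_buffer + r.offset, r.length);
}

int main() {
    std::string_view s = resolve(demo_vocab[2]);
    std::printf("%.*s\n", (int) s.size(), s.data());  // prints "low"
}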