redo: New tokenizer implementation for MPT and GPT-J #765

Closed · wants to merge 1 commit
2 changes: 1 addition & 1 deletion .codespellrc
@@ -1,4 +1,4 @@
[codespell]
skip = .git,*.pdf,*.svg
skip = .git,*.pdf,*.svg,*_tokenizer_config.h
#
# ignore-words-list =
8 changes: 6 additions & 2 deletions gpt4all-backend/CMakeLists.txt
@@ -23,6 +23,7 @@ set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)

set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE)
set(BUILD_SHARED_LIBS ON FORCE)
@@ -34,19 +35,22 @@ if (GPT4ALL_AVX_ONLY)
set(LLAMA_FMA OFF CACHE BOOL "llama: enable FMA" FORCE)
endif()

find_package(ICU REQUIRED COMPONENTS uc i18n)
add_subdirectory(llama.cpp)

add_library(llmodel
gptj.h gptj.cpp
llamamodel.h llamamodel.cpp
llama.cpp/examples/common.cpp
llmodel.h llmodel_c.h llmodel_c.cpp
mpt.h mpt.cpp
mpt.h mpt.cpp tokenizer/bpe.cpp tokenizer/bpe.h
tokenizer/mpt_tokenizer_config.h tokenizer/gptj_tokenizer_config.h
utils.h utils.cpp
)

target_link_libraries(llmodel
PRIVATE llama)
PRIVATE llama
PUBLIC ICU::uc ICU::i18n)

set_target_properties(llmodel PROPERTIES
VERSION ${PROJECT_VERSION}
10 changes: 7 additions & 3 deletions gpt4all-backend/gptj.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <map>
#include <string>
@@ -860,6 +861,8 @@ bool GPTJ::loadModel(const std::string &modelPath) {
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = true;
fflush(stdout);

get_bpecpp_tokenizer(TokenizerType::GPTJ, m_bpe, m_tokav);
return true;
}

@@ -915,7 +918,7 @@ void GPTJ::prompt(const std::string &prompt,
int64_t t_prompt_us = 0;

// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt);
std::vector<uint32_t> embd_inp = m_tokav->encode(prompt, *m_bpe);

// save the context size
promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;
@@ -1032,7 +1035,7 @@ void GPTJ::prompt(const std::string &prompt,
if (id == 50256 /*end of text*/)
goto stop_generating;

const std::string str = d_ptr->vocab.id_to_token[id];
const std::string str = m_tokav->decode({(uint32_t) id}, *m_bpe, true, false);

// Check if the provided str is part of our reverse prompts
bool foundPartialReversePrompt = false;
@@ -1062,7 +1065,8 @@ void GPTJ::prompt(const std::string &prompt,
if (promptCtx.tokens.size() == promptCtx.n_ctx)
promptCtx.tokens.erase(promptCtx.tokens.begin());
promptCtx.tokens.push_back(t);
if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))
const std::string decoded = m_tokav->decode({(uint32_t) t}, *m_bpe, true, false);
if (!responseCallback(t, decoded))
goto stop_generating;
}
cachedTokens.clear();
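The hunks above replace the old gpt_vocab lookups (gpt_tokenize and id_to_token) with the new bpecpp tokenizer selected by get_bpecpp_tokenizer. As a reading aid, here is a minimal C++ sketch of the resulting encode/decode round trip. The signatures are inferred from the call sites in this diff only; the real declarations live in tokenizer/bpe.h, which is not shown on this page, so treat the parameter meanings as assumptions.

// Sketch only: types and signatures inferred from the call sites above, not
// taken from tokenizer/bpe.h (that header is not visible in this diff).
#include <memory>
#include <string>
#include <vector>
#include "tokenizer/bpe.h"

std::unique_ptr<bpecpp::AdditionalVocabAdapter> tokav;
std::unique_ptr<bpecpp::BPE> bpe;

void tokenizer_round_trip(const std::string &prompt) {
    // Select the embedded tokenizer config for the model family
    // (TokenizerType::GPTJ here; MPT and MPT_CHAT are used in mpt.cpp).
    get_bpecpp_tokenizer(TokenizerType::GPTJ, bpe, tokav);

    // Encode: prompt text -> token ids (now uint32_t rather than gpt_vocab::id).
    std::vector<uint32_t> ids = tokav->encode(prompt, *bpe);

    // Decode: token ids -> text. The two boolean flags mirror the calls above;
    // their exact meaning is an assumption, since bpe.h is not shown here.
    std::string text = tokav->decode(ids, *bpe, true, false);
}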
3 changes: 3 additions & 0 deletions gpt4all-backend/gptj.h
@@ -5,6 +5,7 @@
#include <functional>
#include <vector>
#include "llmodel.h"
#include "tokenizer/bpe.h"

class GPTJPrivate;
class GPTJ : public LLModel {
@@ -31,6 +32,8 @@ class GPTJ : public LLModel {

private:
GPTJPrivate *d_ptr;
std::unique_ptr<bpecpp::AdditionalVocabAdapter> m_tokav;
std::unique_ptr<bpecpp::BPE> m_bpe;
};

#endif // GPTJ_H
15 changes: 12 additions & 3 deletions gpt4all-backend/mpt.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <map>
#include <random>
@@ -785,6 +786,12 @@ bool MPT::loadModel(const std::string &modelPath) {
d_ptr->modelLoaded = true;
d_ptr->has_im_end = d_ptr->vocab.token_to_id.find("<|im_end|>") != d_ptr->vocab.token_to_id.end();
fflush(stdout);

if (modelPath.find("-chat") != std::string::npos) {
get_bpecpp_tokenizer(TokenizerType::MPT_CHAT, m_bpe, m_tokav);
} else {
get_bpecpp_tokenizer(TokenizerType::MPT, m_bpe, m_tokav);
}
return true;
}

@@ -840,7 +847,7 @@ void MPT::prompt(const std::string &prompt,
int64_t t_prompt_us = 0;

// tokenize the prompt
std::vector<int> embd_inp = gpt_tokenize(d_ptr->vocab, prompt);
std::vector<uint32_t> embd_inp = m_tokav->encode(prompt, *m_bpe);

// save the context size
promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;
@@ -906,6 +913,7 @@ void MPT::prompt(const std::string &prompt,
int r_instructFound = 0;

std::string cachedResponse;
std::string decodeBuffer;
std::vector<int> cachedTokens;
std::unordered_set<std::string> reversePrompts
= { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
@@ -961,7 +969,7 @@ void MPT::prompt(const std::string &prompt,
if (id == 0 /*end of text*/)
goto stop_generating;

const std::string str = d_ptr->vocab.id_to_token[id];
const std::string str = m_tokav->decode({(uint32_t) id}, *m_bpe, true, false);

// Check if the provided str is part of our reverse prompts
bool foundPartialReversePrompt = false;
@@ -991,7 +999,8 @@ void MPT::prompt(const std::string &prompt,
if (promptCtx.tokens.size() == promptCtx.n_ctx)
promptCtx.tokens.erase(promptCtx.tokens.begin());
promptCtx.tokens.push_back(t);
if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))
const std::string decoded = m_tokav->decode({(uint32_t) t}, *m_bpe, true, false);
if (!responseCallback(t, decoded))
goto stop_generating;
}
cachedTokens.clear();
3 changes: 3 additions & 0 deletions gpt4all-backend/mpt.h
@@ -5,6 +5,7 @@
#include <functional>
#include <vector>
#include "llmodel.h"
#include "tokenizer/bpe.h"

class MPTPrivate;
class MPT : public LLModel {
@@ -31,6 +32,8 @@ class MPT : public LLModel {

private:
MPTPrivate *d_ptr;
std::unique_ptr<bpecpp::AdditionalVocabAdapter> m_tokav;
std::unique_ptr<bpecpp::BPE> m_bpe;
};

#endif // MPT_H
136 changes: 136 additions & 0 deletions gpt4all-backend/scripts/gen_tokenizer_include.py
@@ -0,0 +1,136 @@
import sys
import json
from dataclasses import dataclass

def iter_with_last(lst):
    llen = len(lst)
    for i, entry in enumerate(lst):
        last = i == (llen - 1)
        yield last, entry

@dataclass
class BufSlice:
    offset: int
    length: int
    def __repr__(self):
        return '{'f'0x{self.offset:x},{self.length}''}'

def c_str_dump(bs):
    s = bytearray()
    s += b'"'
    llen = 0
    lasthex = False
    for byte in bs:
        if byte in (b' 01234567890abcdefghijklmnopqrstuvwxyz_-=/;:<>'
                    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*(),.[]{}`~|'):
            # need to avoid hex characters not part of a hex escape
            # appearing directly after a hex escape
            if lasthex and byte in b'0123456789abcdefABCDEF':
                s += b'""'
                llen += 2
            s += bytes([byte])
            llen += 1
            lasthex = False
        else:
            s += f'\\x{byte:02x}'.encode('utf8')
            llen += 4
            lasthex = True
        if llen >= 80:
            llen = 0
            s += b"\"\n\""
    s += b'"'
    return s.decode('utf8')

class Buf:
    def __init__(self):
        self.buf = b''
        self.cache = {}

    def get(self, s):
        if s in self.cache:
            return self.cache[s]
        offset = len(self.buf)
        bs = s.encode('utf8')
        exoffs = self.buf.find(bs)
        if exoffs != -1:
            slc = BufSlice(offset=exoffs, length=len(bs))
            self.cache[s] = slc
            return slc
        return None

    def insert(self, s):
        slc = self.get(s)
        if slc is None:
            bs = s.encode('utf8')
            offset = len(self.buf)
            self.buf += bs
            slc = BufSlice(offset=offset, length=len(bs))
        return slc

class BreakEvery:
    def __init__(self, n):
        self.counter = 0
        self.n = n

    def __repr__(self):
        self.counter += 1
        self.counter %= self.n
        if self.counter == 0:
            return '\n'
        return ''

def do_convert(tkfilename, prefix):
    with open(tkfilename, 'rb') as tkf:
        tokconfig = json.load(tkf)

    # every string in the vocab also appears in the merges list so we can store
    # much less data in the binary by deduplicating these references; sorting by
    # length descending makes it more likely that prefixes of longer strings get
    # deduped, and secondarily sorting them lexicographically makes the buffer
    # data more compressible (the data is not compressed in the binary itself,
    # but the binary will be more compressible)
    split_merges = [s.split(' ') for s in tokconfig['model']['merges']]
    len_then = lambda m: (len(m), m)
    avwords = sorted((av['content'] for av in tokconfig['added_tokens']), key=len_then, reverse=True)
    all_strs = avwords + sorted(list(tokconfig['model']['vocab'].keys()), key=len_then, reverse=True)
    buf = Buf()
    for s in all_strs:
        buf.insert(s)

    print('// @generated GENERATED BY scripts/gen_tokenizer_include.py DO NOT MODIFY')
    print(f'#ifndef {prefix.upper()}_TOKENIZER_CONFIG_H_')
    print(f'#define {prefix.upper()}_TOKENIZER_CONFIG_H_')
    print('#include "bpe.h"')
    print(f"// buflen {len(buf.buf)}")
    print(f"constexpr const char {prefix}_buffer[] =\n{c_str_dump(buf.buf)};")
    avilen = len(tokconfig['added_tokens'])
    print(f'constexpr std::array<bpecpp::additional_vocab_item_embedded, {avilen}> {prefix}_additional_vocab = ''{{')
    for last, avi in iter_with_last(tokconfig['added_tokens']):
        comma = ',' if not last else ''
        print(' {'f'.id = {avi["id"]}, .content={buf.get(avi["content"])}, .special={json.dumps(avi["special"])}''}' + comma)
    print('}};')
    print()
    mergeslen = len(tokconfig['model']['merges'])
    print(f'constexpr std::array<std::pair<bpecpp::buf_ref, bpecpp::buf_ref>, {mergeslen}> {prefix}_merges = ''{{')
    breaker = BreakEvery(4)
    for last, (ma, mb) in iter_with_last(split_merges):
        comma = ',' if not last else ''
        print(' {'f'{buf.get(ma)},{buf.get(mb)}''}' + comma + repr(breaker), end='')
    print('\n}};')
    vocablen = len(tokconfig['model']['vocab'])
    print(f'constexpr std::array<bpecpp::buf_ref, {vocablen}> {prefix}_vocab = ''{{')
    breaker = BreakEvery(8)
    for last, vi in iter_with_last(tokconfig['model']['vocab']):
        comma = ',' if not last else ''
        print(f' {buf.get(vi)}' + comma + repr(breaker), end='')
    print('\n}};')
    print(f'#endif // {prefix.upper()}_TOKENIZER_CONFIG_H_')

def main():
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} <hf tokenizer json> <symbol prefix>')
        sys.exit(1)
    do_convert(sys.argv[1], sys.argv[2])

if __name__ == '__main__':
    main()
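The script above emits the generated header on stdout, so it is presumably run as something like python scripts/gen_tokenizer_include.py tokenizer.json mpt > tokenizer/mpt_tokenizer_config.h (the exact invocation is not shown in this PR). Each emitted vocab and merge entry is an (offset, length) slice into the shared string buffer rather than an owned string. The stand-in C++ sketch below shows how such a slice resolves back into text; bpecpp::buf_ref itself is defined in bpe.h, which is not visible here, so the struct and field names below are assumptions for illustration only.

// Stand-in illustration only: demo_buf_ref approximates what the script's
// {0x<offset>,<length>} initializers describe; the real bpecpp::buf_ref lives
// in tokenizer/bpe.h and may differ.
#include <cstdint>
#include <cstdio>
#include <string_view>

struct demo_buf_ref {   // hypothetical stand-in for bpecpp::buf_ref
    uint32_t offset;
    uint32_t length;
};

// Toy shared buffer as the generator would emit it; "low" needs no extra
// storage because its bytes already occur inside "helloworld".
constexpr char demo_buffer[] = "helloworld";
constexpr demo_buf_ref demo_vocab[] = { {0, 5}, {5, 5}, {3, 3} };

constexpr std::string_view resolve(demo_buf_ref r) {
    return std::string_view(demo_buffer + r.offset, r.length);
}

int main() {
    std::string_view s = resolve(demo_vocab[2]);
    std::printf("%.*s\n", (int) s.size(), s.data());  // prints "low"
}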