Skip to content

Commit

Permalink
support btok tokenization for simple unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
wrongbad committed Jun 15, 2024
1 parent 421f97f commit 074bf41
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 31 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Weights are stored in the binary as bfloat16, and unpacked to float32 at runtime

The bundled tensor math lib uses compile-time shapes and in-place storage, so there is no dynamic memory allocation at all.

Headers are installed with the python package and can be found with `python -m torch2cpp.includes`
12 changes: 6 additions & 6 deletions example/Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
INCLUDES := $(shell python -m torch2cpp.includes)

build/model.js : build/model.cpp
em++ -Os build/model.cpp -I$(INCLUDES) \
-o build/model.js -s MODULARIZE=1 -s EXPORT_NAME=load_model \
-s EXPORTED_FUNCTIONS=_model_step,_model_reset,_model_encode,_model_decode
em++ $^ -o $@ -I$(INCLUDES) \
-Os -s MODULARIZE=1 -s EXPORT_NAME=load_model \
-s EXPORTED_FUNCTIONS=_model_step,_model_reset,_model_encode,_model_decode,_malloc

build/chat_cli : chat_cli.cpp build/model.cpp
c++ -std=c++17 -Os -march=native -ffast-math \
build/model.cpp chat_cli.cpp -I$(INCLUDES) -o build/chat_cli
build/chat_cli : build/model.cpp chat_cli.cpp
c++ $^ -o $@ -I$(INCLUDES) \
-Os -std=c++17 -march=native -ffast-math

.PHONY: model.js
model.js: build/model.js
Expand Down
8 changes: 7 additions & 1 deletion example/chat_cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,15 @@ int main(int argc, char ** argv)
while(true)
{
std::getline(std::cin, prompt);
prompt += "\n";
// prompt += "\n";

int n_tok = model_encode(prompt.c_str(), prompt.size(), toks, max_tokens);

std::cout << n_tok << std::endl;
for(int i=0 ; i<n_tok ; i++)
std::cout << toks[i] << ", ";
std::cout << std::endl;

for(int i=0 ; i<n_tok-1 ; i++)
{
model_step(toks[i], 0);
Expand Down
47 changes: 34 additions & 13 deletions example/sqrll2cpp.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -16,18 +16,21 @@
"source": [
"import torch\n",
"from sqrll.sqrllm import SqrLLM\n",
"import struct\n",
"\n",
"model_file = '../../sqrll/example/models/model2048bpe.pt'\n",
"model_file = '../../sqrll/example/models/model2048wu.pt'\n",
"\n",
"model = SqrLLM.load(model_file).eval()\n",
"\n",
"tokenizer = None\n",
"\n",
"params = sum(p.numel() for p in model.parameters())\n",
"print(f'{params=:,}')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -39,18 +42,24 @@
}
],
"source": [
"from tokenizers import Tokenizer\n",
"# from tokenizers import Tokenizer\n",
"# token_file = '../../sqrll/example/models/tokenizer2048.json'\n",
"# tokenizer = Tokenizer.from_file(token_file)\n",
"# n_vocab = tokenizer.get_vocab_size()\n",
"\n",
"token_file = '../../sqrll/example/models/tokenizer2048.json'\n",
"from btok import Tokenizer\n",
"token_file = '../../sqrll/example/models/bpe2048wu.pack'\n",
"with open(token_file, 'rb') as f:\n",
" tokenizer = Tokenizer(f.read())\n",
"n_vocab = tokenizer.num_tokens()\n",
"tokens = [tokenizer.token(i) for i in range(n_vocab)]\n",
"\n",
"tokenizer = Tokenizer.from_file(token_file)\n",
"n_vocab = tokenizer.get_vocab_size()\n",
"print(f'{n_vocab=}')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -66,21 +75,33 @@
" model,\n",
" out_file,\n",
" args=[inputs, mem],\n",
" tokenizer=tokenizer,\n",
" tokens=tokens,\n",
" autowrap_functions=[\n",
" sqrll.sqrll_kernel,\n",
" sqrllm.rms_norm,\n",
" ],\n",
" skip_weights=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"em++ build/model.cpp -o build/model.js -I/home/k/dev/torch2cpp/src/torch2cpp/include \\\n",
"\t-Os -s MODULARIZE=1 -s EXPORT_NAME=load_model \\\n",
"\t-s EXPORTED_FUNCTIONS=_model_step,_model_reset,_model_encode,_model_decode,_malloc\n"
]
}
],
"source": [
"%%bash\n",
"make model.js"
]
}
],
"metadata": {
Expand Down
2 changes: 0 additions & 2 deletions src/torch2cpp/include/torch2cpp/tokenizer.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include <cstdint>

#include <iostream>

template<int NTOK, int NTREE>
struct Tokenizer
{
Expand Down
34 changes: 25 additions & 9 deletions src/torch2cpp/torch2cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def codegen(
out_file,
args=[],
kwargs={},
tokenizer=None,
tokens=None,
autowrap_functions=[],
c_prefix='model',
skip_weights=False,
Expand All @@ -159,10 +159,9 @@ def codegen(
out = interp.run(*args, **kwargs)


if tokenizer is not None:
n_vocab = tokenizer.get_vocab_size()
vocab = tokenizer.decode_batch([[i] for i in range(n_vocab)])
vocab = [bytes(t, 'utf8') for t in vocab]
if tokens is not None:
n_vocab = len(tokens)
vocab = tokens
token_pack = [struct.pack('B',len(t))+t for t in vocab]
token_pack = [hex(c) for tok in token_pack for c in tok]
token_pack = ','.join(token_pack)
Expand Down Expand Up @@ -217,7 +216,7 @@ def __exit__(self, *_):
w = Writer(out_file)

w('#include "torch2cpp/tensor.h"')
if tokenizer is not None:
if tokens is not None:
w('#include "torch2cpp/tokenizer.h"')
w('\n')

Expand All @@ -229,7 +228,7 @@ def __exit__(self, *_):
w(','.join([hex(x) for x in blob]))
w(';')

if tokenizer is not None:
if tokens is not None:
w(f'uint8_t const g_token_pack[] = {{ {token_pack} }};')

w('// weight tensors')
Expand Down Expand Up @@ -268,7 +267,7 @@ def __exit__(self, *_):
w(f'ml::rng64 g_rng;')
w(f'{class_name} g_model;')

if tokenizer is not None:
if tokens is not None:
w(f'Tokenizer<{n_vocab}, {n_trees}> g_tokenizer = {{ g_token_pack }};')
w('\n')
w('} // namespace\n')
Expand All @@ -290,7 +289,7 @@ def __exit__(self, *_):
}}
''')

if tokenizer is not None:
if tokens is not None:
w(f'''
int {c_prefix}_encode(char const* str, int str_len, int * out, int out_len)
{{
Expand All @@ -300,6 +299,23 @@ def __exit__(self, *_):
{{
return g_tokenizer.decode(toks, toks_len, out, out_len);
}}
''')
else:
w(f'''
int {c_prefix}_encode(char const* str, int str_len, int * out, int out_len)
{{
int i = 0;
for(; i<str_len && i<out_len ; i++)
out[i] = uint8_t(str[i]);
return i;
}}
int {c_prefix}_decode(int const* toks, int toks_len, char * out, int out_len)
{{
int i = 0;
for(; i<toks_len && i<out_len ; i++)
out[i] = uint8_t(toks[i]);
return i;
}}
''')

w('} // extern C\n')

0 comments on commit 074bf41

Please sign in to comment.