Skip to content

Commit

Permalink
support btok tokenization for simple unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
wrongbad committed Jun 15, 2024
1 parent 421f97f commit 074bf41
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 31 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Weights are stored in the binary as bfloat16, and unpacked to float32 at runtime

The bundled tensor math lib uses compile-time shapes and in-place storage, so there is no dynamic memory allocation at all.

Headers are installed with the python package and can be found with `python -m torch2cpp.includes`
12 changes: 6 additions & 6 deletions example/Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
INCLUDES := $(shell python -m torch2cpp.includes)

build/model.js : build/model.cpp
em++ -Os build/model.cpp -I$(INCLUDES) \
-o build/model.js -s MODULARIZE=1 -s EXPORT_NAME=load_model \
-s EXPORTED_FUNCTIONS=_model_step,_model_reset,_model_encode,_model_decode
em++ $^ -o $@ -I$(INCLUDES) \
-Os -s MODULARIZE=1 -s EXPORT_NAME=load_model \
-s EXPORTED_FUNCTIONS=_model_step,_model_reset,_model_encode,_model_decode,_malloc

build/chat_cli : chat_cli.cpp build/model.cpp
c++ -std=c++17 -Os -march=native -ffast-math \
build/model.cpp chat_cli.cpp -I$(INCLUDES) -o build/chat_cli
build/chat_cli : build/model.cpp chat_cli.cpp
c++ $^ -o $@ -I$(INCLUDES) \
-Os -std=c++17 -march=native -ffast-math

.PHONY: model.js
model.js: build/model.js
Expand Down
8 changes: 7 additions & 1 deletion example/chat_cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,15 @@ int main(int argc, char ** argv)
while(true)
{
std::getline(std::cin, prompt);
prompt += "\n";
// prompt += "\n";

int n_tok = model_encode(prompt.c_str(), prompt.size(), toks, max_tokens);

std::cout << n_tok << std::endl;
for(int i=0 ; i<n_tok ; i++)
std::cout << toks[i] << ", ";
std::cout << std::endl;

for(int i=0 ; i<n_tok-1 ; i++)
{
model_step(toks[i], 0);
Expand Down
47 changes: 34 additions & 13 deletions example/sqrll2cpp.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -16,18 +16,21 @@
"source": [
"import torch\n",
"from sqrll.sqrllm import SqrLLM\n",
"import struct\n",
"\n",
"model_file = '../../sqrll/example/models/model2048bpe.pt'\n",
"model_file = '../../sqrll/example/models/model2048wu.pt'\n",
"\n",
"model = SqrLLM.load(model_file).eval()\n",
"\n",
"tokenizer = None\n",
"\n",
"params = sum(p.numel() for p in model.parameters())\n",
"print(f'{params=:,}')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -39,18 +42,24 @@
}
],
"source": [
"from tokenizers import Tokenizer\n",
"# from tokenizers import Tokenizer\n",
"# token_file = '../../sqrll/example/models/tokenizer2048.json'\n",
"# tokenizer = Tokenizer.from_file(token_file)\n",
"# n_vocab = tokenizer.get_vocab_size()\n",
"\n",
"token_file = '../../sqrll/example/models/tokenizer2048.json'\n",
"from btok import Tokenizer\n",
"token_file = '../../sqrll/example/models/bpe2048wu.pack'\n",
"with open(token_file, 'rb') as f:\n",
" tokenizer = Tokenizer(f.read())\n",
"n_vocab = tokenizer.num_tokens()\n",
"tokens = [tokenizer.token(i) for i in range(n_vocab)]\n",
"\n",
"tokenizer = Tokenizer.from_file(token_file)\n",
"n_vocab = tokenizer.get_vocab_size()\n",
"print(f'{n_vocab=}')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -66,21 +75,33 @@
" model,\n",
" out_file,\n",
" args=[inputs, mem],\n",
" tokenizer=tokenizer,\n",
" tokens=tokens,\n",
" autowrap_functions=[\n",
" sqrll.sqrll_kernel,\n",
" sqrllm.rms_norm,\n",
" ],\n",
" skip_weights=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"em++ build/model.cpp -o build/model.js -I/home/k/dev/torch2cpp/src/torch2cpp/include \\\n",
"\t-Os -s MODULARIZE=1 -s EXPORT_NAME=load_model \\\n",
"\t-s EXPORTED_FUNCTIONS=_model_step,_model_reset,_model_encode,_model_decode,_malloc\n"
]
}
],
"source": [
"%%bash\n",
"make model.js"
]
}
],
"metadata": {
Expand Down
2 changes: 0 additions & 2 deletions src/torch2cpp/include/torch2cpp/tokenizer.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include <cstdint>

#include <iostream>

template<int NTOK, int NTREE>
struct Tokenizer
{
Expand Down
34 changes: 25 additions & 9 deletions src/torch2cpp/torch2cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def codegen(
out_file,
args=[],
kwargs={},
tokenizer=None,
tokens=None,
autowrap_functions=[],
c_prefix='model',
skip_weights=False,
Expand All @@ -159,10 +159,9 @@ def codegen(
out = interp.run(*args, **kwargs)


if tokenizer is not None:
n_vocab = tokenizer.get_vocab_size()
vocab = tokenizer.decode_batch([[i] for i in range(n_vocab)])
vocab = [bytes(t, 'utf8') for t in vocab]
if tokens is not None:
n_vocab = len(tokens)
vocab = tokens
token_pack = [struct.pack('B',len(t))+t for t in vocab]
token_pack = [hex(c) for tok in token_pack for c in tok]
token_pack = ','.join(token_pack)
Expand Down Expand Up @@ -217,7 +216,7 @@ def __exit__(self, *_):
w = Writer(out_file)

w('#include "torch2cpp/tensor.h"')
if tokenizer is not None:
if tokens is not None:
w('#include "torch2cpp/tokenizer.h"')
w('\n')

Expand All @@ -229,7 +228,7 @@ def __exit__(self, *_):
w(','.join([hex(x) for x in blob]))
w(';')

if tokenizer is not None:
if tokens is not None:
w(f'uint8_t const g_token_pack[] = {{ {token_pack} }};')

w('// weight tensors')
Expand Down Expand Up @@ -268,7 +267,7 @@ def __exit__(self, *_):
w(f'ml::rng64 g_rng;')
w(f'{class_name} g_model;')

if tokenizer is not None:
if tokens is not None:
w(f'Tokenizer<{n_vocab}, {n_trees}> g_tokenizer = {{ g_token_pack }};')
w('\n')
w('} // namespace\n')
Expand All @@ -290,7 +289,7 @@ def __exit__(self, *_):
}}
''')

if tokenizer is not None:
if tokens is not None:
w(f'''
int {c_prefix}_encode(char const* str, int str_len, int * out, int out_len)
{{
Expand All @@ -300,6 +299,23 @@ def __exit__(self, *_):
{{
return g_tokenizer.decode(toks, toks_len, out, out_len);
}}
''')
else:
w(f'''
int {c_prefix}_encode(char const* str, int str_len, int * out, int out_len)
{{
int i = 0;
for(; i<str_len && i<out_len ; i++)
out[i] = uint8_t(str[i]);
return i;
}}
int {c_prefix}_decode(int const* toks, int toks_len, char * out, int out_len)
{{
int i = 0;
for(; i<toks_len && i<out_len ; i++)
out[i] = uint8_t(toks[i]);
return i;
}}
''')

w('} // extern C\n')

0 comments on commit 074bf41

Please sign in to comment.