From a2d4d1913cd1f862c5dd8d3d7a9f90b3aed9079e Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Tue, 20 Aug 2024 23:28:06 +0200
Subject: [PATCH 1/8] server : added with_pieces functionality to /tokenize endpoint

---
 examples/server/README.md                     | 29 +++++++++++++++++--
 examples/server/server.cpp                    | 20 +++++++++++--
 examples/server/tests/features/steps/steps.py | 26 +++++++++++++++++
 examples/server/utils.hpp                     |  2 +-
 4 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 930ae15f64d8b..dba47d94d315a 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -500,9 +500,34 @@ Notice that each `probs` is an array of length `n_probs`.
 
 *Options:*
 
-    `content`: Set the text to tokenize.
+    `content`: (Required) The text to tokenize.
+
+    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
 
-    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
+
+**Response:**
+
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter.
+
+
+If `with_pieces` is `false`:
+```json
+{
+  "tokens": [123, 456, 789]
+}
+```
+
+If `with_pieces` is `true`:
+```json
+{
+  "tokens": [
+    {"id": 123, "piece": "Hello"},
+    {"id": 456, "piece": " world"},
+    {"id": 789, "piece": "!"}
+  ]
+}
+```
 
 ### POST `/detokenize`: Convert tokens to text
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ce711eadd29ac..5430924092c21 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3189,12 +3189,26 @@ int main(int argc, char ** argv) {
     const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
         const json body = json::parse(req.body);
 
-        std::vector<llama_token> tokens;
+        json tokens_response = json::array();
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
         }
-        const json data = format_tokenizer_response(tokens);
+
+        const json data = format_tokenizer_response(tokens_response);
         return res.set_content(data.dump(), MIMETYPE_JSON);
     };
 
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 1ba7b60b69c46..fec6bcae5ba08 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -685,6 +685,32 @@ def step_tokenize_set_add_special(context):
     context.tokenize_add_special = True
 
 
+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    context.tokenized_text = context_text(context)
+    async with aiohttp.ClientSession() as session:
+        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+        if getattr(context, "tokenize_add_special", None) is not None:
+            tokenize_args["add_special"] = context.tokenize_add_special
+
+        async with session.post(
+            f"{context.base_url}/tokenize", json=tokenize_args
+        ) as response:
+            assert response.status == 200
+            tokenize_json = await response.json()
+            context.tokens_with_pieces = tokenize_json["tokens"]
+
+
+@step("tokens with pieces are complete")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    # Verify that the response contains both token IDs and pieces
+    assert all(
+        "id" in token and "piece" in token for token in context.tokens_with_pieces
+    )
+
+
 @step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index e6a1f069723ec..42635accadf6f 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -583,7 +583,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}
     };
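A minimal client sketch of the endpoint as documented above, assuming a llama.cpp server already listening on `http://localhost:8080` (the default port); the example text and host are illustrative, and the request and response fields follow the README hunk in this patch:

```python
# Minimal sketch: POST /tokenize with with_pieces enabled, stdlib only.
# Assumes a llama.cpp server is running at http://localhost:8080.
import json
import urllib.request

payload = json.dumps({
    "content": "Hello world!",
    "with_pieces": True,
}).encode("utf-8")

req = urllib.request.Request(
    "http://localhost:8080/tokenize",
    data=payload,
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    for tok in json.load(resp)["tokens"]:
        # With with_pieces=true each entry is {"id": ..., "piece": ...}.
        print(tok["id"], repr(tok["piece"]))
```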
From 198daa4e3434958a56473a23c7d72906bbf7d352 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:04:21 +0200
Subject: [PATCH 2/8] server : Add tokenize with pieces tests to server.feature

---
 examples/server/tests/features/server.feature | 10 +++++++++-
 examples/server/tests/features/steps/steps.py |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index b55971454afc3..6ba6f39f3554f 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -104,7 +104,15 @@ Feature: llama.cpp server
     Then tokens begin with BOS
     Given first token is removed
     Then tokens can be detokenized
-
+    
+    Scenario: Tokenize with pieces
+      When tokenizing with pieces:
+        """
+          What is the capital of Germany?
+          媽
+          """
+      Then tokens are given with pieces
+    
     Scenario: Models available
       Given available models
       Then 1 models are supported
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index fec6bcae5ba08..da24f47d991c0 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -702,7 +702,7 @@ async def step_tokenize_with_pieces(context):
             context.tokens_with_pieces = tokenize_json["tokens"]
 
 
-@step("tokens with pieces are complete")
+@step("tokens are given with pieces")
 @async_run_until_complete
 async def step_tokenize_with_pieces(context):
     # Verify that the response contains both token IDs and pieces
From b11e63ce43f84ea870daeb18932b1907574ab958 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:32:28 +0200
Subject: [PATCH 3/8] Handle case if tokenizer splits along utf8 continuation bytes

---
 examples/server/README.md  |  2 +-
 examples/server/server.cpp | 15 ++++++++++++++-
 examples/server/utils.hpp  | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index dba47d94d315a..82f9a373f1542 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -508,7 +508,7 @@ Notice that each `probs` is an array of length `n_probs`.
 
 **Response:**
 
-Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter.
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The `piece` field is a string if the piece is valid Unicode, or a list of byte values otherwise.
 
 
 If `with_pieces` is `false`:
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5430924092c21..efb2121e08f2a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3198,9 +3198,22 @@ int main(int argc, char ** argv) {
             if (with_pieces) {
                 for (const auto& token : tokens) {
                     std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast<int>(c));
+                        }
+                    }
+
                     tokens_response.push_back({
                         {"id", token},
-                        {"piece", piece}
+                        {"piece", piece_json}
                     });
                 }
             } else {
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 42635accadf6f..6f81e4e6ba485 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -583,6 +583,39 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}

From 42fb6707e85fe2cafe6f744a9110b7391e5b76a8 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:41:44 +0200
Subject: [PATCH 4/8] Add example of token splitting

---
 examples/server/README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/examples/server/README.md b/examples/server/README.md
index 82f9a373f1542..db3c7f6ffb06f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -529,6 +529,16 @@ If `with_pieces` is `true`:
 }
 ```
 
+With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
+```json
+{
+  "tokens": [
+    {"id": 198, "piece": [195]},  // hex C3
+    {"id": 164, "piece": [161]}   // hex A1
+  ]
+}
+```
+
 ### POST `/detokenize`: Convert tokens to text
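Because a single multi-byte character can be split across several tokens, a client that wants the original text back cannot simply concatenate `piece` strings. A minimal reassembly sketch, hard-coding the byte values from the 'á' example above for illustration:

```python
# Sketch: rebuild the original text from a with_pieces response where some
# pieces arrived as lists of byte values (UTF-8 sequences split mid-character).
tokens = [
    {"id": 198, "piece": [195]},  # first byte of 'á' (0xC3)
    {"id": 164, "piece": [161]},  # second byte of 'á' (0xA1)
]

buf = bytearray()
for tok in tokens:
    piece = tok["piece"]
    if isinstance(piece, str):
        buf += piece.encode("utf-8")  # string pieces are valid UTF-8
    else:
        buf += bytes(piece)           # byte-array pieces are raw bytes

print(buf.decode("utf-8"))  # -> "á"
```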
From 0c5baa1cd1cab8deeeb7345dee0bfd642d4ad6be Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:43:30 +0200
Subject: [PATCH 5/8] Remove trailing ws

---
 examples/server/README.md  | 2 +-
 examples/server/server.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index db3c7f6ffb06f..f47a147762bc5 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -501,7 +501,7 @@ Notice that each `probs` is an array of length `n_probs`.
 
 *Options:*
 
     `content`: (Required) The text to tokenize.
-    
+
     `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index efb2121e08f2a..779357af1b205 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3220,7 +3220,7 @@ int main(int argc, char ** argv) {
                 tokens_response = tokens;
             }
         }
-        
+
         const json data = format_tokenizer_response(tokens_response);
         return res.set_content(data.dump(), MIMETYPE_JSON);
     };

From 0d198bbf98ab3998cb88805842127b9dd2e74852 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 09:49:26 +0200
Subject: [PATCH 6/8] Fix trailing ws

---
 examples/server/README.md                     | 2 +-
 examples/server/tests/features/server.feature | 6 +++---
 examples/server/utils.hpp                     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index f47a147762bc5..bdb1e074b2de0 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -533,7 +533,7 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
 ```json
 {
   "tokens": [
-    {"id": 198, "piece": [195]},  // hex C3 
+    {"id": 198, "piece": [195]},  // hex C3
     {"id": 164, "piece": [161]}   // hex A1
   ]
 }
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 6ba6f39f3554f..15e24c624af37 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -104,15 +104,15 @@ Feature: llama.cpp server
     Then tokens begin with BOS
     Given first token is removed
     Then tokens can be detokenized
-    
+
     Scenario: Tokenize with pieces
       When tokenizing with pieces:
         """
          What is the capital of Germany?
          媽
          """
      Then tokens are given with pieces
-    
+
     Scenario: Models available
       Given available models
       Then 1 models are supported
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 6f81e4e6ba485..31a7e7bf5ff26 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -603,7 +603,7 @@ static bool is_valid_utf8(const std::string & str) {
             bytes += 3;
         } else if ((*bytes & 0xF8) == 0xF0) {
             // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 || 
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
                 (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
                 return false;
             bytes += 4;

From 0fbed972aa3f1edc875b4dc4ea5be8ed4f4fe7c2 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 10:19:00 +0200
Subject: [PATCH 7/8] Maybe fix ci

---
 .github/workflows/server.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 99feb28f2a545..1b00f3f108298 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -173,6 +173,7 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
+          set PYTHONIOENCODING=utf-8
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests

From 661a740d557b2c13b48648446ae085d14a7feb4a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 12 Sep 2024 21:58:24 +0200
Subject: [PATCH 8/8] maybe this fix windows ci?

---
 .github/workflows/server.yml                  | 2 +-
 examples/server/tests/features/steps/steps.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 1b00f3f108298..29f8fd4443119 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -173,7 +173,7 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
-          set PYTHONIOENCODING=utf-8
+          $env:PYTHONIOENCODING = ":replace"
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests

diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 0981aa0a4d41d..11587dd64075a 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 import asyncio
 import json
 import os
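The last two patches deal with a Windows CI failure: the new test scenario prints 媽, which the default Windows console encoding cannot represent. `set` is cmd.exe syntax and has no effect on environment variables under PowerShell, the default shell on Windows runners, hence the switch to `$env:`. The value `":replace"` leaves Python's stream encoding at its default and only swaps in the `replace` error handler, so unencodable characters degrade to `?` instead of raising `UnicodeEncodeError`. A minimal sketch of the effect, with `cp1252` standing in for a Windows console encoding:

```python
# With the "replace" error handler, a character missing from the target
# encoding becomes "?" instead of raising UnicodeEncodeError.
text = "媽"
print(text.encode("cp1252", errors="replace"))  # b'?'
try:
    text.encode("cp1252")  # default errors="strict"
except UnicodeEncodeError as e:
    print("strict encoding fails:", e)
```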