
server : Add option to return token pieces in /tokenize endpoint #9108

Merged · 9 commits · Sep 12, 2024
1 change: 1 addition & 0 deletions .github/workflows/server.yml
@@ -173,6 +173,7 @@ jobs:
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
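          # Leaving the encoding part before ':' empty keeps the default console
          # encoding and sets only the error handler, so Python replaces unencodable
          # characters instead of raising UnicodeEncodeError (assumed motivation:
          # the new tests print non-ASCII token pieces on Windows)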
          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
39 changes: 37 additions & 2 deletions examples/server/README.md
@@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.

*Options:*

`content`: (Required) The text to tokenize.

`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`

`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`

**Response:**

Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The `piece` field is a string if the piece is valid UTF-8, or a list of byte values otherwise.

If `with_pieces` is `false`:
```json
{
"tokens": [123, 456, 789]
}
```

If `with_pieces` is `true`:
```json
{
"tokens": [
{"id": 123, "piece": "Hello"},
{"id": 456, "piece": " world"},
{"id": 789, "piece": "!"}
]
}
```

With input 'á' (UTF-8 hex: C3 A1) on tinyllama/stories260k:
```json
{
"tokens": [
{"id": 198, "piece": [195]}, // hex C3
{"id": 164, "piece": [161]} // hex A1
]
}
```
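For reference, a minimal Python client sketch that handles both piece formats (illustrative only; it assumes a server listening on `http://localhost:8080`):

```python
import json
import urllib.request

# Illustrative sketch: assumes a llama.cpp server is listening on
# http://localhost:8080 (adjust the host/port to your setup).
def tokenize_with_pieces(content: str) -> list:
    req = urllib.request.Request(
        "http://localhost:8080/tokenize",
        data=json.dumps({"content": content, "with_pieces": True}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["tokens"]

# A piece is a string when it is valid UTF-8 and a list of byte values
# otherwise, so collecting raw bytes lets multi-byte characters that were
# split across tokens (like 'á' above) be reassembled client-side.
buf = bytearray()
for token in tokenize_with_pieces("á"):
    piece = token["piece"]
    buf += bytes(piece) if isinstance(piece, list) else piece.encode("utf-8")
print(buf.decode("utf-8"))  # -> á
```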

### POST `/detokenize`: Convert tokens to text

33 changes: 30 additions & 3 deletions examples/server/server.cpp
@@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) {
    const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
        const json body = json::parse(req.body);

        json tokens_response = json::array();
        if (body.count("content") != 0) {
            const bool add_special = json_value(body, "add_special", false);
            const bool with_pieces = json_value(body, "with_pieces", false);
            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);

            if (with_pieces) {
                for (const auto & token : tokens) {
                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
                    json piece_json;

                    // Check if the piece is valid UTF-8
                    if (is_valid_utf8(piece)) {
                        piece_json = piece;
                    } else {
                        // If not valid UTF-8, store as array of byte values
                        piece_json = json::array();
                        for (unsigned char c : piece) {
                            piece_json.push_back(static_cast<int>(c));
                        }
                    }

                    tokens_response.push_back({
                        {"id", token},
                        {"piece", piece_json}
                    });
                }
            } else {
                tokens_response = tokens;
            }
        }

        const json data = format_tokenizer_response(tokens_response);
        res_ok(res, data);
    };

8 changes: 8 additions & 0 deletions examples/server/tests/features/server.feature
@@ -105,6 +105,14 @@ Feature: llama.cpp server
    Given first token is removed
    Then tokens can be detokenized

  Scenario: Tokenize with pieces
    When tokenizing with pieces:
      """
      What is the capital of Germany?
      """
    Then tokens are given with pieces

  Scenario: Models available
    Given available models
    Then 1 models are supported
29 changes: 29 additions & 0 deletions examples/server/tests/features/steps/steps.py
@@ -1,3 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import asyncio
import json
import os
@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
    context.tokenize_add_special = True


@step("tokenizing with pieces")
@async_run_until_complete
ngxson marked this conversation as resolved.
Show resolved Hide resolved
async def step_tokenize_with_pieces(context):
context.tokenized_text = context_text(context)
async with aiohttp.ClientSession() as session:
tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
if getattr(context, "tokenize_add_special", None) is not None:
tokenize_args["add_special"] = context.tokenize_add_special

async with session.post(
f"{context.base_url}/tokenize", json=tokenize_args
) as response:
assert response.status == 200
tokenize_json = await response.json()
context.tokens_with_pieces = tokenize_json["tokens"]


@step("tokens are given with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
# Verify that the response contains both token IDs and pieces
assert all(
"id" in token and "piece" in token for token in context.tokens_with_pieces
)


@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
35 changes: 34 additions & 1 deletion examples/server/utils.hpp
@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
return res;
}

static bool is_valid_utf8(const std::string & str) {
    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(str.data());
    const unsigned char * end = bytes + str.length();

    while (bytes < end) {
        if (*bytes <= 0x7F) {
            // 1-byte sequence (0xxxxxxx)
            bytes++;
        } else if ((*bytes & 0xE0) == 0xC0) {
            // 2-byte sequence (110xxxxx 10xxxxxx)
            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
                return false;
            bytes += 2;
        } else if ((*bytes & 0xF0) == 0xE0) {
            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
                return false;
            bytes += 3;
        } else if ((*bytes & 0xF8) == 0xF0) {
            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
                return false;
            bytes += 4;
        } else {
            // Invalid UTF-8 lead byte
            return false;
        }
    }

    return true;
}

static json format_tokenizer_response(const json & tokens) {
    return json {
        {"tokens", tokens}
    };