From a2d4d1913cd1f862c5dd8d3d7a9f90b3aed9079e Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Tue, 20 Aug 2024 23:28:06 +0200
Subject: [PATCH 1/8] server : added with_pieces functionality to /tokenize endpoint

---
 examples/server/README.md                     | 29 +++++++++++++++++--
 examples/server/server.cpp                    | 20 +++++++++++--
 examples/server/tests/features/steps/steps.py | 26 +++++++++++++++++
 examples/server/utils.hpp                     |  2 +-
 4 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 930ae15f64d8b..dba47d94d315a 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -500,9 +500,34 @@ Notice that each `probs` is an array of length `n_probs`.
 
 *Options:*
 
-    `content`: Set the text to tokenize.
+    `content`: (Required) The text to tokenize.
+
+    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
 
-    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
+
+**Response:**
+
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter.
+
+
+If `with_pieces` is `false`:
+```json
+{
+  "tokens": [123, 456, 789]
+}
+```
+
+If `with_pieces` is `true`:
+```json
+{
+  "tokens": [
+    {"id": 123, "piece": "Hello"},
+    {"id": 456, "piece": " world"},
+    {"id": 789, "piece": "!"}
+  ]
+}
+```
 
 ### POST `/detokenize`: Convert tokens to text
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ce711eadd29ac..5430924092c21 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3189,12 +3189,26 @@ int main(int argc, char ** argv) {
     const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
         const json body = json::parse(req.body);
 
-        std::vector<llama_token> tokens;
+        json tokens_response = json::array();
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
         }
-        const json data = format_tokenizer_response(tokens);
+
+        const json data = format_tokenizer_response(tokens_response);
         return res.set_content(data.dump(), MIMETYPE_JSON);
     };
 
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 1ba7b60b69c46..fec6bcae5ba08 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -685,6 +685,32 @@ def step_tokenize_set_add_special(context):
     context.tokenize_add_special = True
 
 
+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    context.tokenized_text = context_text(context)
+    async with aiohttp.ClientSession() as session:
+        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+        if getattr(context, "tokenize_add_special", None) is not None:
+            tokenize_args["add_special"] = context.tokenize_add_special
+
+        async with session.post(
+            f"{context.base_url}/tokenize", json=tokenize_args
+        ) as response:
+            assert response.status == 200
+            tokenize_json = await response.json()
+            context.tokens_with_pieces = tokenize_json["tokens"]
+
+
+@step("tokens with pieces are complete")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    # Verify that the response contains both token IDs and pieces
+    assert all(
+        "id" in token and "piece" in token for token in context.tokens_with_pieces
+    )
+
+
 @step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index e6a1f069723ec..42635accadf6f 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -583,7 +583,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}
     };
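A minimal client sketch of the endpoint as documented above, assuming a llama.cpp server already listening on `http://localhost:8080` (the default port); the example text and host are illustrative, and the request and response fields follow the README hunk in this patch:

```python
# Minimal sketch: POST /tokenize with with_pieces enabled, stdlib only.
# Assumes a llama.cpp server is running at http://localhost:8080.
import json
import urllib.request

payload = json.dumps({
    "content": "Hello world!",
    "with_pieces": True,
}).encode("utf-8")

req = urllib.request.Request(
    "http://localhost:8080/tokenize",
    data=payload,
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    for tok in json.load(resp)["tokens"]:
        # With with_pieces=true each entry is {"id": ..., "piece": ...}.
        print(tok["id"], repr(tok["piece"]))
```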
From 198daa4e3434958a56473a23c7d72906bbf7d352 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:04:21 +0200
Subject: [PATCH 2/8] server : Add tokenize with pieces tests to server.feature

---
 examples/server/tests/features/server.feature | 10 +++++++++-
 examples/server/tests/features/steps/steps.py |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index b55971454afc3..6ba6f39f3554f 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -104,7 +104,15 @@ Feature: llama.cpp server
     Then tokens begin with BOS
     Given first token is removed
     Then tokens can be detokenized
-
+    
+    Scenario: Tokenize with pieces
+      When tokenizing with pieces:
+        """
+          What is the capital of Germany?
+          媽
+          """
+      Then tokens are given with pieces
+    
     Scenario: Models available
       Given available models
       Then 1 models are supported
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index fec6bcae5ba08..da24f47d991c0 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -702,7 +702,7 @@ async def step_tokenize_with_pieces(context):
             context.tokens_with_pieces = tokenize_json["tokens"]
 
 
-@step("tokens with pieces are complete")
+@step("tokens are given with pieces")
 @async_run_until_complete
 async def step_tokenize_with_pieces(context):
     # Verify that the response contains both token IDs and pieces
From b11e63ce43f84ea870daeb18932b1907574ab958 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:32:28 +0200
Subject: [PATCH 3/8] Handle case if tokenizer splits along utf8 continuation bytes

---
 examples/server/README.md  |  2 +-
 examples/server/server.cpp | 15 ++++++++++++++-
 examples/server/utils.hpp  | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index dba47d94d315a..82f9a373f1542 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -508,7 +508,7 @@ Notice that each `probs` is an array of length `n_probs`.
 
 **Response:**
 
-Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter.
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The `piece` field is a string if the piece is valid Unicode, or a list of byte values otherwise.
 
 
 If `with_pieces` is `false`:
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5430924092c21..efb2121e08f2a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3198,9 +3198,22 @@ int main(int argc, char ** argv) {
             if (with_pieces) {
                 for (const auto& token : tokens) {
                     std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast<int>(c));
+                        }
+                    }
+
                     tokens_response.push_back({
                         {"id", token},
-                        {"piece", piece}
+                        {"piece", piece_json}
                     });
                 }
             } else {
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 42635accadf6f..6f81e4e6ba485 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -583,6 +583,39 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}

From 42fb6707e85fe2cafe6f744a9110b7391e5b76a8 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:41:44 +0200
Subject: [PATCH 4/8] Add example of token splitting

---
 examples/server/README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/examples/server/README.md b/examples/server/README.md
index 82f9a373f1542..db3c7f6ffb06f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -529,6 +529,16 @@ If `with_pieces` is `true`:
 }
 ```
 
+With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
+```json
+{
+  "tokens": [
+    {"id": 198, "piece": [195]},  // hex C3
+    {"id": 164, "piece": [161]}   // hex A1
+  ]
+}
+```
+
 ### POST `/detokenize`: Convert tokens to text
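Because a single multi-byte character can be split across several tokens, a client that wants the original text back cannot simply concatenate `piece` strings. A minimal reassembly sketch, hard-coding the byte values from the 'á' example above for illustration:

```python
# Sketch: rebuild the original text from a with_pieces response where some
# pieces arrived as lists of byte values (UTF-8 sequences split mid-character).
tokens = [
    {"id": 198, "piece": [195]},  # first byte of 'á' (0xC3)
    {"id": 164, "piece": [161]},  # second byte of 'á' (0xA1)
]

buf = bytearray()
for tok in tokens:
    piece = tok["piece"]
    if isinstance(piece, str):
        buf += piece.encode("utf-8")  # string pieces are valid UTF-8
    else:
        buf += bytes(piece)           # byte-array pieces are raw bytes

print(buf.decode("utf-8"))  # -> "á"
```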
From 0c5baa1cd1cab8deeeb7345dee0bfd642d4ad6be Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 00:43:30 +0200
Subject: [PATCH 5/8] Remove trailing ws

---
 examples/server/README.md  | 2 +-
 examples/server/server.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index db3c7f6ffb06f..f47a147762bc5 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -501,7 +501,7 @@ Notice that each `probs` is an array of length `n_probs`.
 
 *Options:*
 
     `content`: (Required) The text to tokenize.
-    
+
     `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index efb2121e08f2a..779357af1b205 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3220,7 +3220,7 @@ int main(int argc, char ** argv) {
                 tokens_response = tokens;
             }
         }
-        
+
         const json data = format_tokenizer_response(tokens_response);
         return res.set_content(data.dump(), MIMETYPE_JSON);
     };

From 0d198bbf98ab3998cb88805842127b9dd2e74852 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 09:49:26 +0200
Subject: [PATCH 6/8] Fix trailing ws

---
 examples/server/README.md                     | 2 +-
 examples/server/tests/features/server.feature | 6 +++---
 examples/server/utils.hpp                     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index f47a147762bc5..bdb1e074b2de0 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -533,7 +533,7 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
 ```json
 {
   "tokens": [
-    {"id": 198, "piece": [195]},  // hex C3 
+    {"id": 198, "piece": [195]},  // hex C3
     {"id": 164, "piece": [161]}   // hex A1
   ]
 }
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 6ba6f39f3554f..15e24c624af37 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -104,15 +104,15 @@ Feature: llama.cpp server
     Then tokens begin with BOS
     Given first token is removed
     Then tokens can be detokenized
-    
+
     Scenario: Tokenize with pieces
       When tokenizing with pieces:
         """
          What is the capital of Germany?
          媽
          """
      Then tokens are given with pieces
-    
+
     Scenario: Models available
       Given available models
       Then 1 models are supported
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 6f81e4e6ba485..31a7e7bf5ff26 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -603,7 +603,7 @@ static bool is_valid_utf8(const std::string & str) {
             bytes += 3;
         } else if ((*bytes & 0xF8) == 0xF0) {
             // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 || 
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
                 (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
                 return false;
             bytes += 4;

From 0fbed972aa3f1edc875b4dc4ea5be8ed4f4fe7c2 Mon Sep 17 00:00:00 2001
From: Mathijs Henquet
Date: Thu, 22 Aug 2024 10:19:00 +0200
Subject: [PATCH 7/8] Maybe fix ci

---
 .github/workflows/server.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 99feb28f2a545..1b00f3f108298 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -173,6 +173,7 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
+          set PYTHONIOENCODING=utf-8
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests

From 661a740d557b2c13b48648446ae085d14a7feb4a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 12 Sep 2024 21:58:24 +0200
Subject: [PATCH 8/8] maybe this fix windows ci?

---
 .github/workflows/server.yml                  | 2 +-
 examples/server/tests/features/steps/steps.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 1b00f3f108298..29f8fd4443119 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -173,7 +173,7 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
-          set PYTHONIOENCODING=utf-8
+          $env:PYTHONIOENCODING = ":replace"
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests

diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 0981aa0a4d41d..11587dd64075a 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 import asyncio
 import json
 import os
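The last two patches deal with a Windows CI failure: the new test scenario prints 媽, which the default Windows console encoding cannot represent. `set` is cmd.exe syntax and has no effect on environment variables under PowerShell, the default shell on Windows runners, hence the switch to `$env:`. The value `":replace"` leaves Python's stream encoding at its default and only swaps in the `replace` error handler, so unencodable characters degrade to `?` instead of raising `UnicodeEncodeError`. A minimal sketch of the effect, with `cp1252` standing in for a Windows console encoding:

```python
# With the "replace" error handler, a character missing from the target
# encoding becomes "?" instead of raising UnicodeEncodeError.
text = "媽"
print(text.encode("cp1252", errors="replace"))  # b'?'
try:
    text.encode("cp1252")  # default errors="strict"
except UnicodeEncodeError as e:
    print("strict encoding fails:", e)
```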