From fead3e5662cb4ef0c780c1d58ad0f4e9b93c3f71 Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Mon, 14 Oct 2024 16:47:07 -0400 Subject: [PATCH] [Streamer] Fix UTF-8 handling in streamer (#2978) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes a bug in the streamer's handling of UTF-8 characters. Prior to this PR, the streamer assumed that a replacement character (`�`) always corresponds to an entire token. However, for the Qwen2 model tokenizer, some tokens can be ` �` if decoded directly, which breaks the assumption and leads to incorrect results generated by the streamer. This PR fixes this issue with a safer behavior that does not rely on such an assumption. --- cpp/tokenizers/streamer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tokenizers/streamer.cc b/cpp/tokenizers/streamer.cc index 5986a86631..a70cf7b275 100644 --- a/cpp/tokenizers/streamer.cc +++ b/cpp/tokenizers/streamer.cc @@ -64,7 +64,8 @@ std::string TextStreamerObj::Put(const std::vector& delta_tokens) { 0) { new_pending_tokens.push_back(pending_tokens_.back()); pending_tokens_.pop_back(); - validated_str = validated_str.substr(0, validated_str.length() - 3); + all_tokens.pop_back(); + validated_str = tokenizer_->Decode(all_tokens).substr(prefix_str.length()); } } else { // Case 2. prefix_str is not a prefix of `full_str`.