From 0cc6593f10a3eb75bcbe551a68eb78506c3cbbeb Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Fri, 21 Jun 2024 00:00:35 +0200
Subject: [PATCH] Remove previous space

---
 common/common.cpp |  4 +--
 llama.cpp         | 92 +++++++++++++++++++++--------------
 llama.h           |  3 +-
 3 files changed, 45 insertions(+), 54 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 2217547ab506a..541baf2680436 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2908,10 +2908,10 @@ std::vector<llama_token> llama_tokenize(
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
diff --git a/llama.cpp b/llama.cpp
index 911f42ed62a53..9a305fb211fcc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1835,18 +1835,19 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 // NOTE: avoid ever using this except for building the token_to_piece caches
 static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
+    std::string piece;
+    piece.resize(piece.capacity()); // using string internal cache
+    const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
     }
     else {
-        result.resize(n_tokens);
+        piece.resize(n_chars);
     }
 
-    return std::string(result.data(), result.size());
+    return piece;
 }
 
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
@@ -18418,23 +18419,33 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
     // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
     if (!special && llama_is_control_token(model->vocab, token)) {
         return 0;
     }
 
+    // copy piece chars to output text buffer
+    // skip up to 'lstrip' leading spaces before copying
+    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+            token++;
+            size--;
+        }
+        if (length < (int32_t)size) {
+            return (int32_t) -size;
+        }
+        memcpy(buf, token, size);
+        return (int32_t) size;
+    };
+
     // if we have a cache - use it
     {
         const auto & cache = model->vocab.cache_token_to_piece;
 
         if (!cache.empty()) {
-            const auto & res = cache.at(token);
-            if (length < (int) res.size()) {
-                return -(int) res.size();
-            }
-            memcpy(buf, res.c_str(), res.size());
-            return res.size();
+            const auto & result = cache.at(token);
+            return _try_copy(result.data(), result.size());
         }
     }
 
@@ -18447,32 +18458,17 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
+                return _try_copy(result.data(), result.size());
             } else if (
                     (llama_is_user_defined_token(model->vocab, token)) ||
                     (llama_is_control_token     (model->vocab, token) && special)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
-            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
-                if (length < 3) {
-                    return -3;
-                }
-                memcpy(buf, "\xe2\x96\x85", 3);
-                return 3;
+                const std::string & result = model->vocab.id_to_token[token].text;
+                return _try_copy(result.data(), result.size());
+/**/        } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+/**/            return _try_copy("\xe2\x96\x85", 3);
             } else if (llama_is_byte_token(model->vocab, token)) {
-                if (length < 1) {
-                    return -1;
-                }
-                buf[0] = llama_token_to_byte(model->vocab, token);
-                return 1;
+                char byte = (char) llama_token_to_byte(model->vocab, token);
+                return _try_copy((char*)&byte, 1);
             }
             break;
         }
@@ -18480,22 +18476,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             // NOTE: we accept all unsupported token types,
             // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                result = llama_decode_text(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
+                std::string result = llama_decode_text(model->vocab.id_to_token[token].text);
+                return _try_copy(result.data(), result.size());
             } else if (
                     (llama_is_user_defined_token(model->vocab, token)) ||
                     (llama_is_control_token     (model->vocab, token) && special)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
+                const std::string & result = model->vocab.id_to_token[token].text;
+                return _try_copy(result.data(), result.size());
             }
             break;
         }
@@ -18513,12 +18500,15 @@ int32_t llama_detokenize(
         char * text,
         int32_t text_len_max,
         bool special) {
+    // remove the leading space of the first non-control token
+    bool remove_space = model->vocab.tokenizer_add_space_prefix;
     int32_t avail = text_len_max;
     int32_t total = 0;
 
     for (int32_t i = 0; i < n_tokens; ++i) {
         GGML_ASSERT(avail >= 0);
-        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, special);
+        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, special);
+        remove_space = remove_space && llama_is_control_token(model->vocab, tokens[i]); // until non-control token
         if (n_chars < 0) {
             avail = 0;
             total -= n_chars;
diff --git a/llama.h b/llama.h
index d86205e76952a..72bd555fd8868 100644
--- a/llama.h
+++ b/llama.h
@@ -888,13 +888,14 @@ extern "C" {
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
-    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
     // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
                               int32_t   length,
+                              int32_t   lstrip,
                                  bool   special);
 
     /// @details Convert the provided tokens into text.
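
Usage sketch (illustrative, not part of this patch): the caller-facing contract is unchanged apart from the new 'lstrip' argument. A negative return value still means the destination buffer was too small and the call should be retried with at least -n_chars bytes, exactly as the common/common.cpp wrapper above does. The helper name below is hypothetical; the call signature is the one declared in llama.h by this patch.

    #include <string>
    #include "llama.h"

    // Hypothetical caller-side helper: convert one token to its piece text.
    // lstrip = 0 keeps any leading space; special = true renders special tokens.
    static std::string single_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
        std::string piece;
        piece.resize(piece.capacity()); // start with the string's small internal buffer
        int32_t n_chars = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), 0, special);
        if (n_chars < 0) {
            piece.resize(-n_chars); // negative return value = required size
            n_chars = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), 0, special);
        }
        piece.resize(n_chars);
        return piece;
    }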
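
The 'lstrip' parameter is also what the new llama_detokenize logic relies on to drop the space that an 'add_space_prefix' tokenizer inserts in front of the first real token. A rough caller-side equivalent is sketched below; it assumes the caller already knows whether the model adds a space prefix, and it approximates the patch's "until non-control token" rule by only stripping from the first piece that actually produces text. The function name is illustrative.

    #include <string>
    #include <vector>
    #include "llama.h"

    // Illustrative only: concatenate pieces, letting llama_token_to_piece drop at most
    // one leading space from the first piece that yields any characters.
    static std::string join_pieces(const struct llama_model * model,
                                   const std::vector<llama_token> & tokens,
                                   bool add_space_prefix, bool special) {
        std::string text;
        int32_t lstrip = add_space_prefix ? 1 : 0;
        for (llama_token token : tokens) {
            char buf[256];
            int32_t n_chars = llama_token_to_piece(model, token, buf, (int32_t) sizeof(buf), lstrip, special);
            if (n_chars < 0) {
                n_chars = 0; // piece did not fit; a real caller would retry with a larger buffer
            }
            if (n_chars > 0) {
                lstrip = 0;  // only the first non-empty piece may lose its leading space
            }
            text.append(buf, n_chars);
        }
        return text;
    }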