From 577e219c50b5444911fd5848c17fdd21d4791640 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 15 Aug 2024 18:53:04 +0100 Subject: [PATCH] llama : suppress conversion from 'size_t' to 'int' This commit updates llama-vocab.cpp llm_tokenizer_spm.tokenize to suppress/remove the following warnings that are generated on Windows when using MSVC: ```console src\llama-vocab.cpp(211,1): warning C4267: 'argument': conversion from 'size_t' to 'int', possible loss of data src\llama-vocab.cpp(517,1): warning C4267: 'argument': conversion from 'size_t' to 'int', possible loss of data ``` This is done by adding a cast for the size_t returned from symbols.size(). I believe this is safe as it seems unlikely that symbols, which stores an entry for each UTF8 character, would become larger than INT_MAX. --- src/llama-vocab.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 749f8571829dfc..146fccf04ef6e0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -207,7 +207,8 @@ struct llm_tokenizer_spm { } // seed the work queue with all possible 2-character tokens. - for (size_t i = 1; i < symbols.size(); ++i) { + int symbols_size = static_cast<int>(symbols.size()); + for (int i = 1; i < symbols_size; ++i) { try_add_bigram(i - 1, i); } @@ -511,7 +512,8 @@ struct llm_tokenizer_bpe { index++; symbols.emplace_back(sym); } - for (size_t i = 1; i < symbols.size(); ++i) { + int symbols_size = static_cast<int>(symbols.size()); + for (int i = 1; i < symbols.size(); ++i) { add_new_bigram(i - 1, i); }