diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d1dc96276c2a2..05ef0e71f23a7 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: regex_exprs = { "[\r\n]", - "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", "\\s?[!-/:-~!-/:-~‘-‟ -。]+", "\\s+$", "[一-龥ࠀ-一가-퟿]+", diff --git a/src/unicode.cpp b/src/unicode.cpp index 50b35bbbc918c..2b749c522ca61 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -2,6 +2,11 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +#endif + #include "unicode.h" #include "unicode-data.h" @@ -201,8 +206,24 @@ static std::unordered_map unicode_utf8_to_byte_map() { } static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { +#ifdef _WIN32 + int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0); + if (!wlen) { + throw std::invalid_argument("failed to convert regex"); + } + wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); + wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen); + if (!wlen) { + free(wbuf); + throw std::invalid_argument("failed to convert regex"); + } + std::wstring ret = std::wstring(wbuf); + free(wbuf); + return ret; +#else std::wstring_convert> conv; return conv.from_bytes(s); +#endif } static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) {