diff --git a/.gitignore b/.gitignore index b84459b92e86e..601d4aaf571ed 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,20 @@ examples/jeopardy/results.txt poetry.lock poetry.toml nppBackup + +# Test binaries +/tests/test-grammar-parser +/tests/test-llama-grammar +/tests/test-double-float +/tests/test-grad0 +/tests/test-opt +/tests/test-quantize-fns +/tests/test-quantize-perf +/tests/test-sampling +/tests/test-tokenizer-0-llama +/tests/test-tokenizer-0-falcon +/tests/test-tokenizer-0-deepseek-coder +/tests/test-tokenizer-1-llama +/tests/test-tokenizer-1-bpe +/tests/test-rope +/tests/test-backend-ops diff --git a/Makefile b/Makefile index ba73f063709c7..8872f68efd8fb 100644 --- a/Makefile +++ b/Makefile @@ -8,8 +8,9 @@ BUILD_TARGETS = \ TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ - tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease + tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek-coder tests/test-tokenizer-0-deepseek-llm \ + tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ + tests/test-backend-ops # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -52,6 +53,10 @@ test: $(TEST_TARGETS) ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \ elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ + elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \ + ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \ + elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-llm" ]; then \ + ./$$test_target 
$(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \ elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \ continue; \ elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \ @@ -828,6 +833,12 @@ tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $( $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +tests/test-tokenizer-0-deepseek-llm: tests/test-tokenizer-0-deepseek-llm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index cae1551a236b0..cca4a4adf9dd9 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -187,6 +187,8 @@ def from_model_architecture(model_architecture): return RefactModel if model_architecture == "PersimmonForCausalLM": return PersimmonModel + if model_architecture == "LlamaForCausalLM": + return DeepseekCoderModel if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return StableLMModel if model_architecture == "QWenLMHeadModel": @@ -211,6 +213,59 @@ def from_model_architecture(model_architecture): return MiniCPMModel if model_architecture == "BertModel": return BertModel + + @staticmethod + def from_model_name(model_name: str): + model_name_lower = model_name.lower() + if model_name_lower in ("stablelmepoch", "llavastablelmepoch"): + return StableLMModel + if model_name_lower == "gptneox": + return GPTNeoXModel + if model_name_lower == 
"bloom": + return BloomModel + if model_name_lower == "mpt": + return MPTModel + if model_name_lower in ("baichuan"): + return BaichuanModel + if model_name_lower in ("falcon", "rw"): + return FalconModel + if model_name_lower == "gptbigcode": + return StarCoderModel + if model_name_lower == "gptrefact": + return RefactModel + if model_name_lower == "persimmon": + return PersimmonModel + if model_name_lower == "deepseekcoder": + return DeepseekCoderModel + if model_name_lower == "deepseekllm": + return DeepseekLLMModel + return Model + + @staticmethod + def from_model_name(model_name: str): + model_name_lower = model_name.lower() + if model_name_lower in ("stablelmepoch", "llavastablelmepoch"): + return StableLMModel + if model_name_lower == "gptneox": + return GPTNeoXModel + if model_name_lower == "bloom": + return BloomModel + if model_name_lower == "mpt": + return MPTModel + if model_name_lower in ("baichuan"): + return BaichuanModel + if model_name_lower in ("falcon", "rw"): + return FalconModel + if model_name_lower == "gptbigcode": + return StarCoderModel + if model_name_lower == "gptrefact": + return RefactModel + if model_name_lower == "persimmon": + return PersimmonModel + if model_name_lower == "deepseekcoder": + return DeepseekCoderModel + if model_name_lower == "deepseekllm": + return DeepseekLLMModel return Model def _is_model_safetensors(self) -> bool: @@ -244,6 +299,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.REFACT if arch == "PersimmonForCausalLM": return gguf.MODEL_ARCH.PERSIMMON + if arch == "LlamaForCausalLM": + return gguf.MODEL_ARCH.LLAMA if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return gguf.MODEL_ARCH.STABLELM if arch == "QWenLMHeadModel": @@ -271,7 +328,7 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: raise NotImplementedError(f'Architecture "{arch}" not supported!') - def _set_vocab_gpt2(self): + def _set_vocab_gpt2(self, tokenizer_model:str = "gpt2"): dir_model 
= self.dir_model hparams = self.hparams tokens: list[bytearray] = [] @@ -300,7 +357,7 @@ def _set_vocab_gpt2(self): tokens.append(reverse_vocab[i]) toktypes.append(gguf.TokenType.NORMAL) - self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_model(tokenizer_model) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) @@ -1048,6 +1105,29 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +class DeepseekCoderModel(Model): + def set_gguf_parameters(self): + super().set_gguf_parameters() + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def set_vocab(self): + self._set_vocab_gpt2("deepseek_coder") + +class DeepseekLLMModel(DeepseekCoderModel): + def set_vocab(self): + self._set_vocab_gpt2("deepseek_llm") + class StableLMModel(Model): def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): @@ -1749,6 +1829,7 @@ def parse_args() -> argparse.Namespace: "model", type=Path, help="directory containing model file", ) + parser.add_argument("--model-name", type=str, default=None, help="name of the model") return parser.parse_args() diff --git a/llama.cpp b/llama.cpp index a5b873a7bf144..b73eab1ac8b84 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3223,9 +3223,29 @@ static 
void llm_load_vocab( if (add_space_prefix_keyidx != -1) { vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); } // The default value of add_space_prefix is true. - } else if (tokenizer_name == "gpt2") { - vocab.type = LLAMA_VOCAB_TYPE_BPE; - + } else { + if (tokenizer_name == "gpt2") { + vocab.type = LLAMA_VOCAB_TYPE_BPE; + } else if (tokenizer_name == "deepseek_coder") { + vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKCODER; + } else if (tokenizer_name == "deepseek_llm") { + vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKLLM; + } else if (tokenizer_name == "bert") { + vocab.type = LLAMA_VOCAB_TYPE_WPM; + + // default special tokens + vocab.special_bos_id = 101; + vocab.special_eos_id = 102; + vocab.special_unk_id = 100; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.add_space_prefix = false; + } else { + LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); + LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); + vocab.type = LLAMA_VOCAB_TYPE_SPM; + return; + } // read bpe merges and populate bpe ranks const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); if (merges_keyidx == -1) { @@ -3257,21 +3277,6 @@ static void llm_load_vocab( vocab.special_unk_id = -1; vocab.special_sep_id = -1; vocab.special_pad_id = -1; - } else if (tokenizer_name == "bert") { - vocab.type = LLAMA_VOCAB_TYPE_WPM; - - // default special tokens - vocab.special_bos_id = 101; - vocab.special_eos_id = 102; - vocab.special_unk_id = 100; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.add_space_prefix = false; - } else { - LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); - LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); - - vocab.type = LLAMA_VOCAB_TYPE_SPM; } } @@ -4367,7 +4372,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam llm_load_arch (ml, model); llm_load_hparams(ml, model); llm_load_vocab (ml, 
model); - llm_load_print_meta(ml, model); if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { @@ -7679,6 +7683,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { auto buf = token_data.text.substr(3, 2); return strtol(buf.c_str(), NULL, 16); } + case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: case LLAMA_VOCAB_TYPE_BPE: { GGML_ASSERT(false); return unicode_to_bytes_bpe(token_data.text); @@ -7699,6 +7704,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { return vocab.token_to_id.at(buf); } case LLAMA_VOCAB_TYPE_WPM: + case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: case LLAMA_VOCAB_TYPE_BPE: { return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); } @@ -7895,7 +7901,21 @@ struct llm_tokenizer_bpe { void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; - auto word_collection = bpe_gpt2_preprocess(text); + + std::vector word_collection; + switch (vocab.type) { + case LLAMA_VOCAB_TYPE_BPE: + word_collection = bpe_gpt2_preprocess(text); + break; + case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: + word_collection = bpe_deepseek_coder_preprocess(text); + break; + case LLAMA_VOCAB_TYPE_DEEPSEEKLLM: + word_collection = bpe_deepseek_llm_preprocess(text); + break; + default: + break; + } symbols_final.clear(); @@ -8022,143 +8042,81 @@ struct llm_tokenizer_bpe { work_queue.push(bigram); } - std::vector bpe_gpt2_preprocess(const std::string & text) { - std::vector bpe_words; - std::vector bpe_encoded_words; - - std::string token = ""; - // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting = false; - - std::vector text_utf; - text_utf.reserve(text.size()); - bpe_words.reserve(text.size()); - bpe_encoded_words.reserve(text.size()); - - auto cps = codepoints_from_utf8(text); - for (size_t i = 
0; i < cps.size(); ++i) - text_utf.emplace_back(codepoint_to_utf8(cps[i])); - - for (int i = 0; i < (int)text_utf.size(); i++) { - const std::string & utf_char = text_utf[i]; - bool split_condition = false; - int bytes_remain = text_utf.size() - i; - // forward backward lookups - const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; - const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; - - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { - split_condition = true; - } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char + utf_char_next; - bpe_words.emplace_back(token); - token = ""; - i++; - continue; - } - } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == "\'" && ( - (utf_char_next == "r" && utf_char_next_next == "e") || - (utf_char_next == "v" && utf_char_next_next == "e") || - (utf_char_next == "l" && utf_char_next_next == "l")) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char + utf_char_next + utf_char_next_next; - bpe_words.emplace_back(token); // the contraction - token = ""; - i += 2; - continue; - } + std::vector byte_encoding_process(const std::vector & bpe_words) { + std::vectorbpe_encoded_words; + for (auto word : bpe_words) { + std::string text_utf = ""; + auto utf_word = codepoints_from_utf8(word); + for (size_t i = 0; i < utf_word.size(); ++i) + text_utf += codepoint_to_utf8(utf_word[i]); + + std::string encoded_token = ""; + for (char & c : text_utf) { + encoded_token += bytes_to_unicode_bpe(c); } + 
bpe_encoded_words.emplace_back(encoded_token); + } + return bpe_encoded_words; + } - if (!split_condition && !collecting) { - if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { - collecting_letter = true; - collecting = true; - } - else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - collecting_numeric = true; - collecting = true; - } - else if ( - ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || - (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } - else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } - else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { - split_condition = true; + std::vector regex_preprocess(const std::wstring & text, const std::vector & offsets, const std::wstring & regex_expr) { + std::wregex expr(regex_expr); + std::vector bpe_words; // store the offset of each word + bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size + size_t start = 0; + for (auto offset : offsets) { + std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr); + std::wcregex_iterator end; + + int64_t start_idx = 0; + while (it != end) { + std::wcmatch match = *it; + if (match.position() > start_idx) { + bpe_words.emplace_back(match.position() - start_idx); } + bpe_words.emplace_back(match.length()); + start_idx = match.position() + 
match.length(); + ++it; } - else if (!split_condition && collecting) { - if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) { - split_condition = true; - } - else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) { - split_condition = true; - } - else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { - split_condition = true; - } - else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - split_condition = true; - } + if (start_idx < (int64_t) offset) { + bpe_words.emplace_back(offset - start_idx); } + start += offset; + } - if (utf_char_next == "") { - split_condition = true; // final - token += utf_char; - } + return bpe_words; + } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); - } - token = utf_char; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; - } - else { - token += utf_char; - } + std::vector regex_bpe_preprocess(const std::string & text, const std::vector & regex_exprs) { + std::wstring wtext = from_utf8(text); + + std::vector bpe_offsets = {wtext.size()}; + + for(auto & regex_expr : regex_exprs) { + bpe_offsets = regex_preprocess(wtext, bpe_offsets, regex_expr); } - for (std::string & word : bpe_words) { - std::string encoded_token = ""; - for (char & c : word) { - encoded_token += bytes_to_unicode_bpe(c); - } - bpe_encoded_words.emplace_back(encoded_token); + std::vector bpe_words; + bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size + size_t start = 0; + for(size_t & offset : bpe_offsets){ + bpe_words.emplace_back(to_utf8(std::wstring(wtext, start, offset))); + start += offset; } - return 
bpe_encoded_words; + return byte_encoding_process(bpe_words); + } + + std::vector bpe_gpt2_preprocess(const std::string & text) { + return regex_bpe_preprocess(text, gpt2_regex); + } + + std::vector bpe_deepseek_coder_preprocess(const std::string & text) { + return regex_bpe_preprocess(text, deepseek_coder_regex); + } + + std::vector bpe_deepseek_llm_preprocess(const std::string & text) { + return regex_bpe_preprocess(text, deepseek_llm_regex); } const llama_vocab & vocab; @@ -8548,6 +8506,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } } break; + case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: + case LLAMA_VOCAB_TYPE_DEEPSEEKLLM: case LLAMA_VOCAB_TYPE_BPE: { for (const auto & fragment: fragment_buffer) { @@ -12228,6 +12188,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } break; } + case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: + case LLAMA_VOCAB_TYPE_DEEPSEEKLLM: case LLAMA_VOCAB_TYPE_BPE: { // NOTE: we accept all unsupported token types, // suppressing them like CONTROL tokens. 
diff --git a/llama.h b/llama.h index 367e8f1a105a5..5efe6cc17c002 100644 --- a/llama.h +++ b/llama.h @@ -59,9 +59,11 @@ extern "C" { typedef int32_t llama_seq_id; enum llama_vocab_type { - LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece - LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding - LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece + LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece + LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding + LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece + LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 3, // Deepseek Coder + LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 4, // Deepseek LLM }; enum llama_token_type { diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf new file mode 100644 index 0000000000000..2531e1e1914ef Binary files /dev/null and b/models/ggml-vocab-deepseek-coder.gguf differ diff --git a/models/ggml-vocab-deepseek-llm.gguf b/models/ggml-vocab-deepseek-llm.gguf new file mode 100644 index 0000000000000..8fed82fa0ba1b Binary files /dev/null and b/models/ggml-vocab-deepseek-llm.gguf differ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3e40a78cdeac9..1482f6cec558a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -29,12 +29,17 @@ llama_build_and_test_executable(test-quantize-fns.cpp) llama_build_and_test_executable(test-quantize-perf.cpp) llama_build_and_test_executable(test-sampling.cpp) + llama_build_executable(test-tokenizer-0-llama.cpp) llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) + llama_build_executable(test-tokenizer-0-falcon.cpp) llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) +llama_build_executable(test-tokenizer-0-deepseek-coder.cpp) +llama_test_executable (test-tokenizer-0-deepseek-coder test-tokenizer-0-deepseek-coder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) + 
llama_build_executable(test-tokenizer-1-llama.cpp) llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) diff --git a/tests/test-tokenizer-0-deepseek-coder.cpp b/tests/test-tokenizer-0-deepseek-coder.cpp new file mode 100644 index 0000000000000..16966e0726b33 --- /dev/null +++ b/tests/test-tokenizer-0-deepseek-coder.cpp @@ -0,0 +1,188 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-deepseek-coder.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 207, }, }, + { " " , { 243, }, }, + { " " , { 315, }, }, + { "\t" , { 184, }, }, + { "\n" , { 185, }, }, + { "\t\n" , { 184, 185, }, }, + { "Hello world" , { 17535, 1835, }, }, + { " Hello world" , { 414, 9489, 1835, }, }, + { "Hello World" , { 17535, 5414, }, }, + { " Hello World" , { 414, 9489, 5414, }, }, + { " Hello World!" , { 414, 9489, 5414, 0, }, }, + { "Hello, world!" , { 17535, 11, 1835, 0, }, }, + { " Hello, world!" 
, { 414, 9489, 11, 1835, 0, }, }, + { " this is 🦙.cpp" , { 437, 317, 12394, 99, 234, 13, 14789, }, }, + { "w048 7tuijk dsdfhu" , { 86, 15, 19, 23, 207, 22, 83, 3963, 27659, 26078, 3934, 14072, }, }, + { "нещо на Български" , { 1593, 6478, 616, 2251, 14994, }, }, + { "កាន់តែពិសេសអាចខលចេញ" , { 155, 239, 209, 155, 239, 114, 155, 239, 228, 155, 240, 220, 155, 239, 224, 155, 240, 211, 155, 239, 231, 155, 239, 115, 155, 239, 240, 155, 240, 210, 155, 239, 240, 155, 239, 95, 155, 239, 114, 155, 239, 214, 155, 239, 210, 155, 239, 236, 155, 239, 214, 155, 240, 210, 155, 239, 218, }, }, + { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 10047, 235, 209, 334, 8760, 8, 12394, 233, 114, 350, 222, 10047, 221, 104, 169, 116, 224, 334, 4684, 3909, 992, 24330, 262, 29651, 612, 8, 207, 156, 237, 214, 334, 5950, 992, 78, 12896, 344, 638, 891, 1372, 10736, 8, }, }, + { "Hello" , { 17535, }, }, + { " Hello" , { 414, 9489, }, }, + { " Hello" , { 207, 414, 9489, }, }, + { " Hello" , { 243, 414, 9489, }, }, + { " Hello" , { 315, 414, 9489, }, }, + { " Hello\n Hello" , { 315, 414, 9489, 185, 315, 414, 9489, }, }, + { "\n =" , { 185, 405, }, }, + { "' era" , { 6, 2895, }, }, + { "Hello, y'all! 
How are you 😁 ?我想在apple工作1314151天~", { 17535, 11, 320, 6, 435, 0, 1717, 417, 340, 12394, 233, 210, 3015, 19100, 608, 9413, 2668, 16, 18, 16, 19, 16, 20, 16, 1393, 169, 121, 239, }, }, + + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + + ctx = llama_new_context_with_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_DEEPSEEKCODER) { + fprintf(stderr, "%s : error: vocab type is not DEEPSEEKCODER\n", __func__); + llama_free_model(model); + llama_free(ctx); + return 2; + } + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + for (int i = 0; i < (int) res.size() && 
correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_bpe(ctx, res).c_str(), + llama_detokenize_bpe(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, false); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector{tok}) << "'" << std::endl; + } + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 
0 : 3; +} diff --git a/tests/test-tokenizer-0-deepseek-coder.py b/tests/test-tokenizer-0-deepseek-coder.py new file mode 100644 index 0000000000000..b99840e1b9191 --- /dev/null +++ b/tests/test-tokenizer-0-deepseek-coder.py @@ -0,0 +1,83 @@ +# tests with BPE tokenizer + +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + "\n =", + "' era", + "Hello, y'all! 
How are you 😁 ?我想在apple工作1314151天~", +] + +for text in tests: + print('text: ', text) + print(tokenizer.encode(text)) + print(tokenizer.decode(tokenizer.encode(text))) + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r', encoding='utf-8') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s) + # write to file + with open(fname_out, 'w', encoding='utf-8') as f: + for x in res: + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-deepseek-llm.cpp b/tests/test-tokenizer-0-deepseek-llm.cpp new file mode 100644 index 0000000000000..98d628615fe94 --- /dev/null +++ b/tests/test-tokenizer-0-deepseek-llm.cpp @@ -0,0 +1,188 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-deepseek-llm.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 207, }, }, + { " " , { 243, }, }, + { " " , { 300, }, }, + { "\t" , { 184, }, }, + { "\n" , { 185, }, }, + { "\t\n" , { 184, 185, }, }, + { "Hello world" , { 17464, 1843, }, }, + { " Hello world" , { 37727, 1843, }, }, + { "Hello World" , { 17464, 5427, }, }, + { " Hello World" , { 37727, 5427, }, }, + { " Hello World!" , { 37727, 5427, 0, }, }, + { "Hello, world!" , { 17464, 11, 1843, 0, }, }, + { " Hello, world!" 
, { 37727, 11, 1843, 0, }, }, + { " this is 🦙.cpp" , { 437, 317, 12356, 99, 234, 13, 14743, }, }, + { "w048 7tuijk dsdfhu" , { 86, 15, 19, 23, 207, 22, 83, 3970, 27519, 26016, 3944, 14025, }, }, + { "нещо на Български" , { 1603, 6476, 620, 91754, }, }, + { "កាន់តែពិសេសអាចខលចេញ" , { 71374, 209, 71374, 114, 71374, 228, 155, 240, 220, 71374, 224, 155, 240, 211, 71374, 231, 71374, 115, 71374, 240, 155, 240, 210, 71374, 240, 71374, 95, 71374, 114, 71374, 214, 71374, 210, 71374, 236, 71374, 214, 155, 240, 210, 71374, 218, }, }, + { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 10044, 95300, 334, 8754, 8, 33701, 114, 350, 222, 10044, 221, 104, 46713, 334, 34732, 996, 24250, 262, 80923, 8, 207, 37103, 214, 334, 5956, 89213, 344, 643, 895, 1377, 10728, 8, }, }, + { "Hello" , { 17464, }, }, + { " Hello" , { 37727, }, }, + { " Hello" , { 207, 37727, }, }, + { " Hello" , { 243, 37727, }, }, + { " Hello" , { 300, 37727, }, }, + { " Hello\n Hello" , { 300, 37727, 185, 300, 37727, }, }, + { "\n =" , { 185, 403, }, }, + { "' era" , { 6, 2906, }, }, + { "Hello, y'all! 
How are you 😁 ?我想在apple工作1314151天~", { 17464, 11, 320, 6, 436, 0, 1724, 418, 340, 33701, 210, 3025, 19017, 612, 9407, 2681, 16, 18, 16, 19, 16, 20, 16, 1398, 68940, 239, }, }, + + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + + ctx = llama_new_context_with_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_DEEPSEEKLLM) { + fprintf(stderr, "%s : error: vocab type is not DEEPSEEKLLM\n", __func__); + llama_free_model(model); + llama_free(ctx); + return 2; + } + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + for (int i = 0; i < (int) res.size() && correct; ++i) { 
+ if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_bpe(ctx, res).c_str(), + llama_detokenize_bpe(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, false); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector{tok}) << "'" << std::endl; + } + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 
0 : 3; +} diff --git a/tests/test-tokenizer-0-deepseek-llm.py b/tests/test-tokenizer-0-deepseek-llm.py new file mode 100644 index 0000000000000..b99840e1b9191 --- /dev/null +++ b/tests/test-tokenizer-0-deepseek-llm.py @@ -0,0 +1,83 @@ +# tests with BPE tokenizer + +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + "\n =", + "' era", + "Hello, y'all! 
How are you 😁 ?我想在apple工作1314151天~", +] + +for text in tests: + print('text: ', text) + print(tokenizer.encode(text)) + print(tokenizer.decode(tokenizer.encode(text))) + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r', encoding='utf-8') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s) + # write to file + with open(fname_out, 'w', encoding='utf-8') as f: + for x in res: + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index a4e9d2b912728..677608decef0a 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -38,6 +38,7 @@ static const std::map> & k_tests() { { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, { "\n =" , { 1212, 40, }, }, { "' era" , { 18, 4932, }, }, + { "Hello, y'all! 
How are you 😁 ?我想在apple工作1314151天~", { 9856, 23, 291, 18, 436, 12, 1265, 362, 299, 8196, 207, 204, 42, 50087, 123, 2727, 20300, 32022, 133, 234, 17419, 30137, 28, 7858, 181, 133, 236, }, }, }; return _k_tests; @@ -115,7 +116,6 @@ int main(int argc, char **argv) { printf("\n"); bool correct = res.size() == test_kv.second.size(); - for (int i = 0; i < (int) res.size() && correct; ++i) { if (test_kv.second[i] != res[i]) { correct = false; diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py index 4f06ec9bbba5b..b99840e1b9191 100644 --- a/tests/test-tokenizer-0-falcon.py +++ b/tests/test-tokenizer-0-falcon.py @@ -41,6 +41,7 @@ " Hello\n Hello", "\n =", "' era", + "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", ] for text in tests: diff --git a/unicode.h b/unicode.h index 844eff3dad1b3..6f175b842fa97 100644 --- a/unicode.h +++ b/unicode.h @@ -1,10 +1,13 @@ -#pragma once +#pragma once #include #include #include #include #include +#include +#include +#include static const std::vector> digit_ranges = { {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, @@ -461,3 +464,48 @@ static uint8_t unicode_to_bytes_bpe(const std::string & utf8) { return map.at(utf8); } +static const std::vector gpt2_regex = { + // //punc: \{p} and ascii puncs + 
L"[\U00000021-\U0000002F\U0000003A-\U00000040\\\U0000005B-\U00000060\U0000007B-\U0000007E\U000000A1-\U000000A1\U000000A7-\U000000A7\U000000AB-\U000000AB\U000000B6-\U000000B7\U000000BB-\U000000BB\U000000BF-\U000000BF\U0000037E-\U0000037E\U00000387-\U00000387\U0000055A-\U0000055F\U00000589-\U0000058A\U000005BE-\U000005BE\U000005C0-\U000005C0\U000005C3-\U000005C3\U000005C6-\U000005C6\U000005F3-\U000005F4\U00000609-\U0000060A\U0000060C-\U0000060D\U0000061B-\U0000061B\U0000061E-\U0000061F\U0000066A-\U0000066D\U000006D4-\U000006D4\U00000700-\U0000070D\U000007F7-\U000007F9\U00000830-\U0000083E\U0000085E-\U0000085E\U00000964-\U00000965\U00000970-\U00000970\U000009FD-\U000009FD\U00000A76-\U00000A76\U00000AF0-\U00000AF0\U00000C77-\U00000C77\U00000C84-\U00000C84\U00000DF4-\U00000DF4\U00000E4F-\U00000E4F\U00000E5A-\U00000E5B\U00000F04-\U00000F12\U00000F14-\U00000F14\U00000F3A-\U00000F3D\U00000F85-\U00000F85\U00000FD0-\U00000FD4\U00000FD9-\U00000FDA\U0000104A-\U0000104F\U000010FB-\U000010FB\U00001360-\U00001368\U00001400-\U00001400\U0000166E-\U0000166E\U0000169B-\U0000169C\U000016EB-\U000016ED\U00001735-\U00001736\U000017D4-\U000017D6\U000017D8-\U000017DA\U00001800-\U0000180A\U00001944-\U00001945\U00001A1E-\U00001A1F\U00001AA0-\U00001AA6\U00001AA8-\U00001AAD\U00001B5A-\U00001B60\U00001BFC-\U00001BFF\U00001C3B-\U00001C3F\U00001C7E-\U00001C7F\U00001CC0-\U00001CC7\U00001CD3-\U00001CD3\U00002010-\U00002027\U00002030-\U00002043\U00002045-\U00002051\U00002053-\U0000205E\U0000207D-\U0000207E\U0000208D-\U0000208E\U00002308-\U0000230B\U00002329-\U0000232A\U00002768-\U00002775\U000027C5-\U000027C6\U000027E6-\U000027EF\U00002983-\U00002998\U000029D8-\U000029DB\U000029FC-\U000029FD\U00002CF9-\U00002CFC\U00002CFE-\U00002CFF\U00002D70-\U00002D70\U00002E00-\U00002E2E\U00002E30-\U00002E4F\U00002E52-\U00002E52\U00003001-\U00003003\U00003008-\U00003011\U00003014-\U0000301F\U00003030-\U00003030\U0000303D-\U0000303D\U000030A0-\U000030A0\U000030FB-\U000030FB\U0000A4FE-\U0000A4FF\U0000A60D-\U0000A60F
\U0000A673-\U0000A673\U0000A67E-\U0000A67E\U0000A6F2-\U0000A6F7\U0000A874-\U0000A877\U0000A8CE-\U0000A8CF\U0000A8F8-\U0000A8FA\U0000A8FC-\U0000A8FC\U0000A92E-\U0000A92F\U0000A95F-\U0000A95F\U0000A9C1-\U0000A9CD\U0000A9DE-\U0000A9DF\U0000AA5C-\U0000AA5F\U0000AADE-\U0000AADF\U0000AAF0-\U0000AAF1\U0000ABEB-\U0000ABEB\U0000FD3E-\U0000FD3F\U0000FE10-\U0000FE19\U0000FE30-\U0000FE52\U0000FE54-\U0000FE61\U0000FE63-\U0000FE63\U0000FE68-\U0000FE68\U0000FE6A-\U0000FE6B\U0000FF01-\U0000FF03\U0000FF05-\U0000FF0A\U0000FF0C-\U0000FF0F\U0000FF1A-\U0000FF1B\U0000FF1F-\U0000FF20\U0000FF3B-\U0000FF3D\U0000FF3F-\U0000FF3F\U0000FF5B-\U0000FF5B\U0000FF5D-\U0000FF5D\U0000FF5F-\U0000FF65\U00010100-\U00010102\U0001039F-\U0001039F\U000103D0-\U000103D0\U0001056F-\U0001056F\U00010857-\U00010857\U0001091F-\U0001091F\U0001093F-\U0001093F\U00010A50-\U00010A58\U00010A7F-\U00010A7F\U00010AF0-\U00010AF6\U00010B39-\U00010B3F\U00010B99-\U00010B9C\U00010EAD-\U00010EAD\U00010F55-\U00010F59\U00011047-\U0001104D\U000110BB-\U000110BC\U000110BE-\U000110C1\U00011140-\U00011143\U00011174-\U00011175\U000111C5-\U000111C8\U000111CD-\U000111CD\U000111DB-\U000111DB\U000111DD-\U000111DF\U00011238-\U0001123D\U000112A9-\U000112A9\U0001144B-\U0001144F\U0001145A-\U0001145B\U0001145D-\U0001145D\U000114C6-\U000114C6\U000115C1-\U000115D7\U00011641-\U00011643\U00011660-\U0001166C\U0001173C-\U0001173E\U0001183B-\U0001183B\U00011944-\U00011946\U000119E2-\U000119E2\U00011A3F-\U00011A46\U00011A9A-\U00011A9C\U00011A9E-\U00011AA2\U00011C41-\U00011C45\U00011C70-\U00011C71\U00011EF7-\U00011EF8\U00011FFF-\U00011FFF\U00012470-\U00012474\U00016A6E-\U00016A6F\U00016AF5-\U00016AF5\U00016B37-\U00016B3B\U00016B44-\U00016B44\U00016E97-\U00016E9A\U00016FE2-\U00016FE2\U0001BC9F-\U0001BC9F\U0001DA87-\U0001DA8B\U0001E95E-\U0001E95F]+", + // //'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S) + L"'s|'t|'re|'ve|'m|'ll|'d| 
?[\U00000041-\U0000005A\U00000061-\U0000007A\U000000AA-\U000000AA\U000000B5-\U000000B5\U000000BA-\U000000BA\U000000C0-\U000000D6\U000000D8-\U000000F6\U000000F8-\U000002C1\U000002C6-\U000002D1\U000002E0-\U000002E4\U000002EC-\U000002EC\U000002EE-\U000002EE\U00000370-\U00000374\U00000376-\U00000377\U0000037A-\U0000037D\U0000037F-\U0000037F\U00000386-\U00000386\U00000388-\U0000038A\U0000038C-\U0000038C\U0000038E-\U000003A1\U000003A3-\U000003F5\U000003F7-\U00000481\U0000048A-\U0000052F\U00000531-\U00000556\U00000559-\U00000559\U00000560-\U00000588\U000005D0-\U000005EA\U000005EF-\U000005F2\U00000620-\U0000064A\U0000066E-\U0000066F\U00000671-\U000006D3\U000006D5-\U000006D5\U000006E5-\U000006E6\U000006EE-\U000006EF\U000006FA-\U000006FC\U000006FF-\U000006FF\U00000710-\U00000710\U00000712-\U0000072F\U0000074D-\U000007A5\U000007B1-\U000007B1\U000007CA-\U000007EA\U000007F4-\U000007F5\U000007FA-\U000007FA\U00000800-\U00000815\U0000081A-\U0000081A\U00000824-\U00000824\U00000828-\U00000828\U00000840-\U00000858\U00000860-\U0000086A\U000008A0-\U000008B4\U000008B6-\U000008C7\U00000904-\U00000939\U0000093D-\U0000093D\U00000950-\U00000950\U00000958-\U00000961\U00000971-\U00000980\U00000985-\U0000098C\U0000098F-\U00000990\U00000993-\U000009A8\U000009AA-\U000009B0\U000009B2-\U000009B2\U000009B6-\U000009B9\U000009BD-\U000009BD\U000009CE-\U000009CE\U000009DC-\U000009DD\U000009DF-\U000009E1\U000009F0-\U000009F1\U000009FC-\U000009FC\U00000A05-\U00000A0A\U00000A0F-\U00000A10\U00000A13-\U00000A28\U00000A2A-\U00000A30\U00000A32-\U00000A33\U00000A35-\U00000A36\U00000A38-\U00000A39\U00000A59-\U00000A5C\U00000A5E-\U00000A5E\U00000A72-\U00000A74\U00000A85-\U00000A8D\U00000A8F-\U00000A91\U00000A93-\U00000AA8\U00000AAA-\U00000AB0\U00000AB2-\U00000AB3\U00000AB5-\U00000AB9\U00000ABD-\U00000ABD\U00000AD0-\U00000AD0\U00000AE0-\U00000AE1\U00000AF9-\U00000AF9\U00000B05-\U00000B0C\U00000B0F-\U00000B10\U00000B13-\U00000B28\U00000B2A-\U00000B30\U00000B32-\U00000B33\U00000B35-\U00000B39\U00000B3D-\U00000B3D\U0
0000B5C-\U00000B5D\U00000B5F-\U00000B61\U00000B71-\U00000B71\U00000B83-\U00000B83\U00000B85-\U00000B8A\U00000B8E-\U00000B90\U00000B92-\U00000B95\U00000B99-\U00000B9A\U00000B9C-\U00000B9C\U00000B9E-\U00000B9F\U00000BA3-\U00000BA4\U00000BA8-\U00000BAA\U00000BAE-\U00000BB9\U00000BD0-\U00000BD0\U00000C05-\U00000C0C\U00000C0E-\U00000C10\U00000C12-\U00000C28\U00000C2A-\U00000C39\U00000C3D-\U00000C3D\U00000C58-\U00000C5A\U00000C60-\U00000C61\U00000C80-\U00000C80\U00000C85-\U00000C8C\U00000C8E-\U00000C90\U00000C92-\U00000CA8\U00000CAA-\U00000CB3\U00000CB5-\U00000CB9\U00000CBD-\U00000CBD\U00000CDE-\U00000CDE\U00000CE0-\U00000CE1\U00000CF1-\U00000CF2\U00000D04-\U00000D0C\U00000D0E-\U00000D10\U00000D12-\U00000D3A\U00000D3D-\U00000D3D\U00000D4E-\U00000D4E\U00000D54-\U00000D56\U00000D5F-\U00000D61\U00000D7A-\U00000D7F\U00000D85-\U00000D96\U00000D9A-\U00000DB1\U00000DB3-\U00000DBB\U00000DBD-\U00000DBD\U00000DC0-\U00000DC6\U00000E01-\U00000E30\U00000E32-\U00000E33\U00000E40-\U00000E46\U00000E81-\U00000E82\U00000E84-\U00000E84\U00000E86-\U00000E8A\U00000E8C-\U00000EA3\U00000EA5-\U00000EA5\U00000EA7-\U00000EB0\U00000EB2-\U00000EB3\U00000EBD-\U00000EBD\U00000EC0-\U00000EC4\U00000EC6-\U00000EC6\U00000EDC-\U00000EDF\U00000F00-\U00000F00\U00000F40-\U00000F47\U00000F49-\U00000F6C\U00000F88-\U00000F8C\U00001000-\U0000102A\U0000103F-\U0000103F\U00001050-\U00001055\U0000105A-\U0000105D\U00001061-\U00001061\U00001065-\U00001066\U0000106E-\U00001070\U00001075-\U00001081\U0000108E-\U0000108E\U000010A0-\U000010C5\U000010C7-\U000010C7\U000010CD-\U000010CD\U000010D0-\U000010FA\U000010FC-\U00001248\U0000124A-\U0000124D\U00001250-\U00001256\U00001258-\U00001258\U0000125A-\U0000125D\U00001260-\U00001288\U0000128A-\U0000128D\U00001290-\U000012B0\U000012B2-\U000012B5\U000012B8-\U000012BE\U000012C0-\U000012C0\U000012C2-\U000012C5\U000012C8-\U000012D6\U000012D8-\U00001310\U00001312-\U00001315\U00001318-\U0000135A\U00001380-\U0000138F\U000013A0-\U000013F5\U000013F8-\U000013FD\U00001401-\U0000166C\U000016
6F-\U0000167F\U00001681-\U0000169A\U000016A0-\U000016EA\U000016F1-\U000016F8\U00001700-\U0000170C\U0000170E-\U00001711\U00001720-\U00001731\U00001740-\U00001751\U00001760-\U0000176C\U0000176E-\U00001770\U00001780-\U000017B3\U000017D7-\U000017D7\U000017DC-\U000017DC\U00001820-\U00001878\U00001880-\U00001884\U00001887-\U000018A8\U000018AA-\U000018AA\U000018B0-\U000018F5\U00001900-\U0000191E\U00001950-\U0000196D\U00001970-\U00001974\U00001980-\U000019AB\U000019B0-\U000019C9\U00001A00-\U00001A16\U00001A20-\U00001A54\U00001AA7-\U00001AA7\U00001B05-\U00001B33\U00001B45-\U00001B4B\U00001B83-\U00001BA0\U00001BAE-\U00001BAF\U00001BBA-\U00001BE5\U00001C00-\U00001C23\U00001C4D-\U00001C4F\U00001C5A-\U00001C7D\U00001C80-\U00001C88\U00001C90-\U00001CBA\U00001CBD-\U00001CBF\U00001CE9-\U00001CEC\U00001CEE-\U00001CF3\U00001CF5-\U00001CF6\U00001CFA-\U00001CFA\U00001D00-\U00001DBF\U00001E00-\U00001F15\U00001F18-\U00001F1D\U00001F20-\U00001F45\U00001F48-\U00001F4D\U00001F50-\U00001F57\U00001F59-\U00001F59\U00001F5B-\U00001F5B\U00001F5D-\U00001F5D\U00001F5F-\U00001F7D\U00001F80-\U00001FB4\U00001FB6-\U00001FBC\U00001FBE-\U00001FBE\U00001FC2-\U00001FC4\U00001FC6-\U00001FCC\U00001FD0-\U00001FD3\U00001FD6-\U00001FDB\U00001FE0-\U00001FEC\U00001FF2-\U00001FF4\U00001FF6-\U00001FFC\U00002071-\U00002071\U0000207F-\U0000207F\U00002090-\U0000209C\U00002102-\U00002102\U00002107-\U00002107\U0000210A-\U00002113\U00002115-\U00002115\U00002119-\U0000211D\U00002124-\U00002124\U00002126-\U00002126\U00002128-\U00002128\U0000212A-\U0000212D\U0000212F-\U00002139\U0000213C-\U0000213F\U00002145-\U00002149\U0000214E-\U0000214E\U00002183-\U00002184\U00002C00-\U00002C2E\U00002C30-\U00002C5E\U00002C60-\U00002CE4\U00002CEB-\U00002CEE\U00002CF2-\U00002CF3\U00002D00-\U00002D25\U00002D27-\U00002D27\U00002D2D-\U00002D2D\U00002D30-\U00002D67\U00002D6F-\U00002D6F\U00002D80-\U00002D96\U00002DA0-\U00002DA6\U00002DA8-\U00002DAE\U00002DB0-\U00002DB6\U00002DB8-\U00002DBE\U00002DC0-\U00002DC6\U00002DC8-\U00002DCE\U00002DD0-\U
00002DD6\U00002DD8-\U00002DDE\U00002E2F-\U00002E2F\U00003005-\U00003006\U00003031-\U00003035\U0000303B-\U0000303C\U00003041-\U00003096\U0000309D-\U0000309F\U000030A1-\U000030FA\U000030FC-\U000030FF\U00003105-\U0000312F\U00003131-\U0000318E\U000031A0-\U000031BF\U000031F0-\U000031FF\U00003400-\U00004DBF\U00004E00-\U00009FFC\U0000A000-\U0000A48C\U0000A4D0-\U0000A4FD\U0000A500-\U0000A60C\U0000A610-\U0000A61F\U0000A62A-\U0000A62B\U0000A640-\U0000A66E\U0000A67F-\U0000A69D\U0000A6A0-\U0000A6E5\U0000A717-\U0000A71F\U0000A722-\U0000A788\U0000A78B-\U0000A7BF\U0000A7C2-\U0000A7CA\U0000A7F5-\U0000A801\U0000A803-\U0000A805\U0000A807-\U0000A80A\U0000A80C-\U0000A822\U0000A840-\U0000A873\U0000A882-\U0000A8B3\U0000A8F2-\U0000A8F7\U0000A8FB-\U0000A8FB\U0000A8FD-\U0000A8FE\U0000A90A-\U0000A925\U0000A930-\U0000A946\U0000A960-\U0000A97C\U0000A984-\U0000A9B2\U0000A9CF-\U0000A9CF\U0000A9E0-\U0000A9E4\U0000A9E6-\U0000A9EF\U0000A9FA-\U0000A9FE\U0000AA00-\U0000AA28\U0000AA40-\U0000AA42\U0000AA44-\U0000AA4B\U0000AA60-\U0000AA76\U0000AA7A-\U0000AA7A\U0000AA7E-\U0000AAAF\U0000AAB1-\U0000AAB1\U0000AAB5-\U0000AAB6\U0000AAB9-\U0000AABD\U0000AAC0-\U0000AAC0\U0000AAC2-\U0000AAC2\U0000AADB-\U0000AADD\U0000AAE0-\U0000AAEA\U0000AAF2-\U0000AAF4\U0000AB01-\U0000AB06\U0000AB09-\U0000AB0E\U0000AB11-\U0000AB16\U0000AB20-\U0000AB26\U0000AB28-\U0000AB2E\U0000AB30-\U0000AB5A\U0000AB5C-\U0000AB69\U0000AB70-\U0000ABE2\U0000AC00-\U0000D7A3\U0000D7B0-\U0000D7C6\U0000D7CB-\U0000D7FB\U0000F900-\U0000FA6D\U0000FA70-\U0000FAD9\U0000FB00-\U0000FB06\U0000FB13-\U0000FB17\U0000FB1D-\U0000FB1D\U0000FB1F-\U0000FB28\U0000FB2A-\U0000FB36\U0000FB38-\U0000FB3C\U0000FB3E-\U0000FB3E\U0000FB40-\U0000FB41\U0000FB43-\U0000FB44\U0000FB46-\U0000FBB1\U0000FBD3-\U0000FD3D\U0000FD50-\U0000FD8F\U0000FD92-\U0000FDC7\U0000FDF0-\U0000FDFB\U0000FE70-\U0000FE74\U0000FE76-\U0000FEFC\U0000FF21-\U0000FF3A\U0000FF41-\U0000FF5A\U0000FF66-\U0000FFBE\U0000FFC2-\U0000FFC7\U0000FFCA-\U0000FFCF\U0000FFD2-\U0000FFD7\U0000FFDA-\U0000FFDC\U00010000-\U00010
00B\U0001000D-\U00010026\U00010028-\U0001003A\U0001003C-\U0001003D\U0001003F-\U0001004D\U00010050-\U0001005D\U00010080-\U000100FA\U00010280-\U0001029C\U000102A0-\U000102D0\U00010300-\U0001031F\U0001032D-\U00010340\U00010342-\U00010349\U00010350-\U00010375\U00010380-\U0001039D\U000103A0-\U000103C3\U000103C8-\U000103CF\U00010400-\U0001049D\U000104B0-\U000104D3\U000104D8-\U000104FB\U00010500-\U00010527\U00010530-\U00010563\U00010600-\U00010736\U00010740-\U00010755\U00010760-\U00010767\U00010800-\U00010805\U00010808-\U00010808\U0001080A-\U00010835\U00010837-\U00010838\U0001083C-\U0001083C\U0001083F-\U00010855\U00010860-\U00010876\U00010880-\U0001089E\U000108E0-\U000108F2\U000108F4-\U000108F5\U00010900-\U00010915\U00010920-\U00010939\U00010980-\U000109B7\U000109BE-\U000109BF\U00010A00-\U00010A00\U00010A10-\U00010A13\U00010A15-\U00010A17\U00010A19-\U00010A35\U00010A60-\U00010A7C\U00010A80-\U00010A9C\U00010AC0-\U00010AC7\U00010AC9-\U00010AE4\U00010B00-\U00010B35\U00010B40-\U00010B55\U00010B60-\U00010B72\U00010B80-\U00010B91\U00010C00-\U00010C48\U00010C80-\U00010CB2\U00010CC0-\U00010CF2\U00010D00-\U00010D23\U00010E80-\U00010EA9\U00010EB0-\U00010EB1\U00010F00-\U00010F1C\U00010F27-\U00010F27\U00010F30-\U00010F45\U00010FB0-\U00010FC4\U00010FE0-\U00010FF6\U00011003-\U00011037\U00011083-\U000110AF\U000110D0-\U000110E8\U00011103-\U00011126\U00011144-\U00011144\U00011147-\U00011147\U00011150-\U00011172\U00011176-\U00011176\U00011183-\U000111B2\U000111C1-\U000111C4\U000111DA-\U000111DA\U000111DC-\U000111DC\U00011200-\U00011211\U00011213-\U0001122B\U00011280-\U00011286\U00011288-\U00011288\U0001128A-\U0001128D\U0001128F-\U0001129D\U0001129F-\U000112A8\U000112B0-\U000112DE\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U0001133D-\U0001133D\U00011350-\U00011350\U0001135D-\U00011361\U00011400-\U00011434\U00011447-\U0001144A\U0001145F-\U00011461\U00011480-\U000114AF\U000114C4-\U000114C5\U000114C7-\U000114C7\U
00011580-\U000115AE\U000115D8-\U000115DB\U00011600-\U0001162F\U00011644-\U00011644\U00011680-\U000116AA\U000116B8-\U000116B8\U00011700-\U0001171A\U00011800-\U0001182B\U000118A0-\U000118DF\U000118FF-\U00011906\U00011909-\U00011909\U0001190C-\U00011913\U00011915-\U00011916\U00011918-\U0001192F\U0001193F-\U0001193F\U00011941-\U00011941\U000119A0-\U000119A7\U000119AA-\U000119D0\U000119E1-\U000119E1\U000119E3-\U000119E3\U00011A00-\U00011A00\U00011A0B-\U00011A32\U00011A3A-\U00011A3A\U00011A50-\U00011A50\U00011A5C-\U00011A89\U00011A9D-\U00011A9D\U00011AC0-\U00011AF8\U00011C00-\U00011C08\U00011C0A-\U00011C2E\U00011C40-\U00011C40\U00011C72-\U00011C8F\U00011D00-\U00011D06\U00011D08-\U00011D09\U00011D0B-\U00011D30\U00011D46-\U00011D46\U00011D60-\U00011D65\U00011D67-\U00011D68\U00011D6A-\U00011D89\U00011D98-\U00011D98\U00011EE0-\U00011EF2\U00011FB0-\U00011FB0\U00012000-\U00012399\U00012480-\U00012543\U00013000-\U0001342E\U00014400-\U00014646\U00016800-\U00016A38\U00016A40-\U00016A5E\U00016AD0-\U00016AED\U00016B00-\U00016B2F\U00016B40-\U00016B43\U00016B63-\U00016B77\U00016B7D-\U00016B8F\U00016E40-\U00016E7F\U00016F00-\U00016F4A\U00016F50-\U00016F50\U00016F93-\U00016F9F\U00016FE0-\U00016FE1\U00016FE3-\U00016FE3\U00017000-\U000187F7\U00018800-\U00018CD5\U00018D00-\U00018D08\U0001B000-\U0001B11E\U0001B150-\U0001B152\U0001B164-\U0001B167\U0001B170-\U0001B2FB\U0001BC00-\U0001BC6A\U0001BC70-\U0001BC7C\U0001BC80-\U0001BC88\U0001BC90-\U0001BC99\U0001D400-\U0001D454\U0001D456-\U0001D49C\U0001D49E-\U0001D49F\U0001D4A2-\U0001D4A2\U0001D4A5-\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B9\U0001D4BB-\U0001D4BB\U0001D4BD-\U0001D4C3\U0001D4C5-\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D51E-\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546-\U0001D546\U0001D54A-\U0001D550\U0001D552-\U0001D6A5\U0001D6A8-\U0001D6C0\U0001D6C2-\U0001D6DA\U0001D6DC-\U0001D6FA\U0001D6FC-\U0001D714\U0001D716-\U0001D734\U0001D736-\U0001D74E\U0001D750-\U0001D76E\U0001D
770-\U0001D788\U0001D78A-\U0001D7A8\U0001D7AA-\U0001D7C2\U0001D7C4-\U0001D7CB\U0001E100-\U0001E12C\U0001E137-\U0001E13D\U0001E14E-\U0001E14E\U0001E2C0-\U0001E2EB\U0001E800-\U0001E8C4\U0001E900-\U0001E943\U0001E94B-\U0001E94B\U0001EE00-\U0001EE03\U0001EE05-\U0001EE1F\U0001EE21-\U0001EE22\U0001EE24-\U0001EE24\U0001EE27-\U0001EE27\U0001EE29-\U0001EE32\U0001EE34-\U0001EE37\U0001EE39-\U0001EE39\U0001EE3B-\U0001EE3B\U0001EE42-\U0001EE42\U0001EE47-\U0001EE47\U0001EE49-\U0001EE49\U0001EE4B-\U0001EE4B\U0001EE4D-\U0001EE4F\U0001EE51-\U0001EE52\U0001EE54-\U0001EE54\U0001EE57-\U0001EE57\U0001EE59-\U0001EE59\U0001EE5B-\U0001EE5B\U0001EE5D-\U0001EE5D\U0001EE5F-\U0001EE5F\U0001EE61-\U0001EE62\U0001EE64-\U0001EE64\U0001EE67-\U0001EE6A\U0001EE6C-\U0001EE72\U0001EE74-\U0001EE77\U0001EE79-\U0001EE7C\U0001EE7E-\U0001EE7E\U0001EE80-\U0001EE89\U0001EE8B-\U0001EE9B\U0001EEA1-\U0001EEA3\U0001EEA5-\U0001EEA9\U0001EEAB-\U0001EEBB\U00020000-\U0002A6DD\U0002A700-\U0002B734\U0002B740-\U0002B81D\U0002B820-\U0002CEA1\U0002CEB0-\U0002EBE0\U0002F800-\U0002FA1D\U00030000-\U0003134A]+| 
?[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]+| 
?[^\\s\U00000041-\U0000005A\U00000061-\U0000007A\U000000AA-\U000000AA\U000000B5-\U000000B5\U000000BA-\U000000BA\U000000C0-\U000000D6\U000000D8-\U000000F6\U000000F8-\U000002C1\U000002C6-\U000002D1\U000002E0-\U000002E4\U000002EC-\U000002EC\U000002EE-\U000002EE\U00000370-\U00000374\U00000376-\U00000377\U0000037A-\U0000037D\U0000037F-\U0000037F\U00000386-\U00000386\U00000388-\U0000038A\U0000038C-\U0000038C\U0000038E-\U000003A1\U000003A3-\U000003F5\U000003F7-\U00000481\U0000048A-\U0000052F\U00000531-\U00000556\U00000559-\U00000559\U00000560-\U00000588\U000005D0-\U000005EA\U000005EF-\U000005F2\U00000620-\U0000064A\U0000066E-\U0000066F\U00000671-\U000006D3\U000006D5-\U000006D5\U000006E5-\U000006E6\U000006EE-\U000006EF\U000006FA-\U000006FC\U000006FF-\U000006FF\U00000710-\U00000710\U00000712-\U0000072F\U0000074D-\U000007A5\U000007B1-\U000007B1\U000007CA-\U000007EA\U000007F4-\U000007F5\U000007FA-\U000007FA\U00000800-\U00000815\U0000081A-\U0000081A\U00000824-\U00000824\U00000828-\U00000828\U00000840-\U00000858\U00000860-\U0000086A\U000008A0-\U000008B4\U000008B6-\U000008C7\U00000904-\U00000939\U0000093D-\U0000093D\U00000950-\U00000950\U00000958-\U00000961\U00000971-\U00000980\U00000985-\U0000098C\U0000098F-\U00000990\U00000993-\U000009A8\U000009AA-\U000009B0\U000009B2-\U000009B2\U000009B6-\U000009B9\U000009BD-\U000009BD\U000009CE-\U000009CE\U000009DC-\U000009DD\U000009DF-\U000009E1\U000009F0-\U000009F1\U000009FC-\U000009FC\U00000A05-\U00000A0A\U00000A0F-\U00000A10\U00000A13-\U00000A28\U00000A2A-\U00000A30\U00000A32-\U00000A33\U00000A35-\U00000A36\U00000A38-\U00000A39\U00000A59-\U00000A5C\U00000A5E-\U00000A5E\U00000A72-\U00000A74\U00000A85-\U00000A8D\U00000A8F-\U00000A91\U00000A93-\U00000AA8\U00000AAA-\U00000AB0\U00000AB2-\U00000AB3\U00000AB5-\U00000AB9\U00000ABD-\U00000ABD\U00000AD0-\U00000AD0\U00000AE0-\U00000AE1\U00000AF9-\U00000AF9\U00000B05-\U00000B0C\U00000B0F-\U00000B10\U00000B13-\U00000B28\U00000B2A-\U00000B30\U00000B32-\U00000B33\U00000B35-\U00000B39\U00000B3D-\U00000B3
D\U00000B5C-\U00000B5D\U00000B5F-\U00000B61\U00000B71-\U00000B71\U00000B83-\U00000B83\U00000B85-\U00000B8A\U00000B8E-\U00000B90\U00000B92-\U00000B95\U00000B99-\U00000B9A\U00000B9C-\U00000B9C\U00000B9E-\U00000B9F\U00000BA3-\U00000BA4\U00000BA8-\U00000BAA\U00000BAE-\U00000BB9\U00000BD0-\U00000BD0\U00000C05-\U00000C0C\U00000C0E-\U00000C10\U00000C12-\U00000C28\U00000C2A-\U00000C39\U00000C3D-\U00000C3D\U00000C58-\U00000C5A\U00000C60-\U00000C61\U00000C80-\U00000C80\U00000C85-\U00000C8C\U00000C8E-\U00000C90\U00000C92-\U00000CA8\U00000CAA-\U00000CB3\U00000CB5-\U00000CB9\U00000CBD-\U00000CBD\U00000CDE-\U00000CDE\U00000CE0-\U00000CE1\U00000CF1-\U00000CF2\U00000D04-\U00000D0C\U00000D0E-\U00000D10\U00000D12-\U00000D3A\U00000D3D-\U00000D3D\U00000D4E-\U00000D4E\U00000D54-\U00000D56\U00000D5F-\U00000D61\U00000D7A-\U00000D7F\U00000D85-\U00000D96\U00000D9A-\U00000DB1\U00000DB3-\U00000DBB\U00000DBD-\U00000DBD\U00000DC0-\U00000DC6\U00000E01-\U00000E30\U00000E32-\U00000E33\U00000E40-\U00000E46\U00000E81-\U00000E82\U00000E84-\U00000E84\U00000E86-\U00000E8A\U00000E8C-\U00000EA3\U00000EA5-\U00000EA5\U00000EA7-\U00000EB0\U00000EB2-\U00000EB3\U00000EBD-\U00000EBD\U00000EC0-\U00000EC4\U00000EC6-\U00000EC6\U00000EDC-\U00000EDF\U00000F00-\U00000F00\U00000F40-\U00000F47\U00000F49-\U00000F6C\U00000F88-\U00000F8C\U00001000-\U0000102A\U0000103F-\U0000103F\U00001050-\U00001055\U0000105A-\U0000105D\U00001061-\U00001061\U00001065-\U00001066\U0000106E-\U00001070\U00001075-\U00001081\U0000108E-\U0000108E\U000010A0-\U000010C5\U000010C7-\U000010C7\U000010CD-\U000010CD\U000010D0-\U000010FA\U000010FC-\U00001248\U0000124A-\U0000124D\U00001250-\U00001256\U00001258-\U00001258\U0000125A-\U0000125D\U00001260-\U00001288\U0000128A-\U0000128D\U00001290-\U000012B0\U000012B2-\U000012B5\U000012B8-\U000012BE\U000012C0-\U000012C0\U000012C2-\U000012C5\U000012C8-\U000012D6\U000012D8-\U00001310\U00001312-\U00001315\U00001318-\U0000135A\U00001380-\U0000138F\U000013A0-\U000013F5\U000013F8-\U000013FD\U00001401-\U0000166C\U00
00166F-\U0000167F\U00001681-\U0000169A\U000016A0-\U000016EA\U000016F1-\U000016F8\U00001700-\U0000170C\U0000170E-\U00001711\U00001720-\U00001731\U00001740-\U00001751\U00001760-\U0000176C\U0000176E-\U00001770\U00001780-\U000017B3\U000017D7-\U000017D7\U000017DC-\U000017DC\U00001820-\U00001878\U00001880-\U00001884\U00001887-\U000018A8\U000018AA-\U000018AA\U000018B0-\U000018F5\U00001900-\U0000191E\U00001950-\U0000196D\U00001970-\U00001974\U00001980-\U000019AB\U000019B0-\U000019C9\U00001A00-\U00001A16\U00001A20-\U00001A54\U00001AA7-\U00001AA7\U00001B05-\U00001B33\U00001B45-\U00001B4B\U00001B83-\U00001BA0\U00001BAE-\U00001BAF\U00001BBA-\U00001BE5\U00001C00-\U00001C23\U00001C4D-\U00001C4F\U00001C5A-\U00001C7D\U00001C80-\U00001C88\U00001C90-\U00001CBA\U00001CBD-\U00001CBF\U00001CE9-\U00001CEC\U00001CEE-\U00001CF3\U00001CF5-\U00001CF6\U00001CFA-\U00001CFA\U00001D00-\U00001DBF\U00001E00-\U00001F15\U00001F18-\U00001F1D\U00001F20-\U00001F45\U00001F48-\U00001F4D\U00001F50-\U00001F57\U00001F59-\U00001F59\U00001F5B-\U00001F5B\U00001F5D-\U00001F5D\U00001F5F-\U00001F7D\U00001F80-\U00001FB4\U00001FB6-\U00001FBC\U00001FBE-\U00001FBE\U00001FC2-\U00001FC4\U00001FC6-\U00001FCC\U00001FD0-\U00001FD3\U00001FD6-\U00001FDB\U00001FE0-\U00001FEC\U00001FF2-\U00001FF4\U00001FF6-\U00001FFC\U00002071-\U00002071\U0000207F-\U0000207F\U00002090-\U0000209C\U00002102-\U00002102\U00002107-\U00002107\U0000210A-\U00002113\U00002115-\U00002115\U00002119-\U0000211D\U00002124-\U00002124\U00002126-\U00002126\U00002128-\U00002128\U0000212A-\U0000212D\U0000212F-\U00002139\U0000213C-\U0000213F\U00002145-\U00002149\U0000214E-\U0000214E\U00002183-\U00002184\U00002C00-\U00002C2E\U00002C30-\U00002C5E\U00002C60-\U00002CE4\U00002CEB-\U00002CEE\U00002CF2-\U00002CF3\U00002D00-\U00002D25\U00002D27-\U00002D27\U00002D2D-\U00002D2D\U00002D30-\U00002D67\U00002D6F-\U00002D6F\U00002D80-\U00002D96\U00002DA0-\U00002DA6\U00002DA8-\U00002DAE\U00002DB0-\U00002DB6\U00002DB8-\U00002DBE\U00002DC0-\U00002DC6\U00002DC8-\U00002DCE\U00002DD
0-\U00002DD6\U00002DD8-\U00002DDE\U00002E2F-\U00002E2F\U00003005-\U00003006\U00003031-\U00003035\U0000303B-\U0000303C\U00003041-\U00003096\U0000309D-\U0000309F\U000030A1-\U000030FA\U000030FC-\U000030FF\U00003105-\U0000312F\U00003131-\U0000318E\U000031A0-\U000031BF\U000031F0-\U000031FF\U00003400-\U00004DBF\U00004E00-\U00009FFC\U0000A000-\U0000A48C\U0000A4D0-\U0000A4FD\U0000A500-\U0000A60C\U0000A610-\U0000A61F\U0000A62A-\U0000A62B\U0000A640-\U0000A66E\U0000A67F-\U0000A69D\U0000A6A0-\U0000A6E5\U0000A717-\U0000A71F\U0000A722-\U0000A788\U0000A78B-\U0000A7BF\U0000A7C2-\U0000A7CA\U0000A7F5-\U0000A801\U0000A803-\U0000A805\U0000A807-\U0000A80A\U0000A80C-\U0000A822\U0000A840-\U0000A873\U0000A882-\U0000A8B3\U0000A8F2-\U0000A8F7\U0000A8FB-\U0000A8FB\U0000A8FD-\U0000A8FE\U0000A90A-\U0000A925\U0000A930-\U0000A946\U0000A960-\U0000A97C\U0000A984-\U0000A9B2\U0000A9CF-\U0000A9CF\U0000A9E0-\U0000A9E4\U0000A9E6-\U0000A9EF\U0000A9FA-\U0000A9FE\U0000AA00-\U0000AA28\U0000AA40-\U0000AA42\U0000AA44-\U0000AA4B\U0000AA60-\U0000AA76\U0000AA7A-\U0000AA7A\U0000AA7E-\U0000AAAF\U0000AAB1-\U0000AAB1\U0000AAB5-\U0000AAB6\U0000AAB9-\U0000AABD\U0000AAC0-\U0000AAC0\U0000AAC2-\U0000AAC2\U0000AADB-\U0000AADD\U0000AAE0-\U0000AAEA\U0000AAF2-\U0000AAF4\U0000AB01-\U0000AB06\U0000AB09-\U0000AB0E\U0000AB11-\U0000AB16\U0000AB20-\U0000AB26\U0000AB28-\U0000AB2E\U0000AB30-\U0000AB5A\U0000AB5C-\U0000AB69\U0000AB70-\U0000ABE2\U0000AC00-\U0000D7A3\U0000D7B0-\U0000D7C6\U0000D7CB-\U0000D7FB\U0000F900-\U0000FA6D\U0000FA70-\U0000FAD9\U0000FB00-\U0000FB06\U0000FB13-\U0000FB17\U0000FB1D-\U0000FB1D\U0000FB1F-\U0000FB28\U0000FB2A-\U0000FB36\U0000FB38-\U0000FB3C\U0000FB3E-\U0000FB3E\U0000FB40-\U0000FB41\U0000FB43-\U0000FB44\U0000FB46-\U0000FBB1\U0000FBD3-\U0000FD3D\U0000FD50-\U0000FD8F\U0000FD92-\U0000FDC7\U0000FDF0-\U0000FDFB\U0000FE70-\U0000FE74\U0000FE76-\U0000FEFC\U0000FF21-\U0000FF3A\U0000FF41-\U0000FF5A\U0000FF66-\U0000FFBE\U0000FFC2-\U0000FFC7\U0000FFCA-\U0000FFCF\U0000FFD2-\U0000FFD7\U0000FFDA-\U0000FFDC\U00010000-\U0
001000B\U0001000D-\U00010026\U00010028-\U0001003A\U0001003C-\U0001003D\U0001003F-\U0001004D\U00010050-\U0001005D\U00010080-\U000100FA\U00010280-\U0001029C\U000102A0-\U000102D0\U00010300-\U0001031F\U0001032D-\U00010340\U00010342-\U00010349\U00010350-\U00010375\U00010380-\U0001039D\U000103A0-\U000103C3\U000103C8-\U000103CF\U00010400-\U0001049D\U000104B0-\U000104D3\U000104D8-\U000104FB\U00010500-\U00010527\U00010530-\U00010563\U00010600-\U00010736\U00010740-\U00010755\U00010760-\U00010767\U00010800-\U00010805\U00010808-\U00010808\U0001080A-\U00010835\U00010837-\U00010838\U0001083C-\U0001083C\U0001083F-\U00010855\U00010860-\U00010876\U00010880-\U0001089E\U000108E0-\U000108F2\U000108F4-\U000108F5\U00010900-\U00010915\U00010920-\U00010939\U00010980-\U000109B7\U000109BE-\U000109BF\U00010A00-\U00010A00\U00010A10-\U00010A13\U00010A15-\U00010A17\U00010A19-\U00010A35\U00010A60-\U00010A7C\U00010A80-\U00010A9C\U00010AC0-\U00010AC7\U00010AC9-\U00010AE4\U00010B00-\U00010B35\U00010B40-\U00010B55\U00010B60-\U00010B72\U00010B80-\U00010B91\U00010C00-\U00010C48\U00010C80-\U00010CB2\U00010CC0-\U00010CF2\U00010D00-\U00010D23\U00010E80-\U00010EA9\U00010EB0-\U00010EB1\U00010F00-\U00010F1C\U00010F27-\U00010F27\U00010F30-\U00010F45\U00010FB0-\U00010FC4\U00010FE0-\U00010FF6\U00011003-\U00011037\U00011083-\U000110AF\U000110D0-\U000110E8\U00011103-\U00011126\U00011144-\U00011144\U00011147-\U00011147\U00011150-\U00011172\U00011176-\U00011176\U00011183-\U000111B2\U000111C1-\U000111C4\U000111DA-\U000111DA\U000111DC-\U000111DC\U00011200-\U00011211\U00011213-\U0001122B\U00011280-\U00011286\U00011288-\U00011288\U0001128A-\U0001128D\U0001128F-\U0001129D\U0001129F-\U000112A8\U000112B0-\U000112DE\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U0001133D-\U0001133D\U00011350-\U00011350\U0001135D-\U00011361\U00011400-\U00011434\U00011447-\U0001144A\U0001145F-\U00011461\U00011480-\U000114AF\U000114C4-\U000114C5\U000114C7-\U000114
C7\U00011580-\U000115AE\U000115D8-\U000115DB\U00011600-\U0001162F\U00011644-\U00011644\U00011680-\U000116AA\U000116B8-\U000116B8\U00011700-\U0001171A\U00011800-\U0001182B\U000118A0-\U000118DF\U000118FF-\U00011906\U00011909-\U00011909\U0001190C-\U00011913\U00011915-\U00011916\U00011918-\U0001192F\U0001193F-\U0001193F\U00011941-\U00011941\U000119A0-\U000119A7\U000119AA-\U000119D0\U000119E1-\U000119E1\U000119E3-\U000119E3\U00011A00-\U00011A00\U00011A0B-\U00011A32\U00011A3A-\U00011A3A\U00011A50-\U00011A50\U00011A5C-\U00011A89\U00011A9D-\U00011A9D\U00011AC0-\U00011AF8\U00011C00-\U00011C08\U00011C0A-\U00011C2E\U00011C40-\U00011C40\U00011C72-\U00011C8F\U00011D00-\U00011D06\U00011D08-\U00011D09\U00011D0B-\U00011D30\U00011D46-\U00011D46\U00011D60-\U00011D65\U00011D67-\U00011D68\U00011D6A-\U00011D89\U00011D98-\U00011D98\U00011EE0-\U00011EF2\U00011FB0-\U00011FB0\U00012000-\U00012399\U00012480-\U00012543\U00013000-\U0001342E\U00014400-\U00014646\U00016800-\U00016A38\U00016A40-\U00016A5E\U00016AD0-\U00016AED\U00016B00-\U00016B2F\U00016B40-\U00016B43\U00016B63-\U00016B77\U00016B7D-\U00016B8F\U00016E40-\U00016E7F\U00016F00-\U00016F4A\U00016F50-\U00016F50\U00016F93-\U00016F9F\U00016FE0-\U00016FE1\U00016FE3-\U00016FE3\U00017000-\U000187F7\U00018800-\U00018CD5\U00018D00-\U00018D08\U0001B000-\U0001B11E\U0001B150-\U0001B152\U0001B164-\U0001B167\U0001B170-\U0001B2FB\U0001BC00-\U0001BC6A\U0001BC70-\U0001BC7C\U0001BC80-\U0001BC88\U0001BC90-\U0001BC99\U0001D400-\U0001D454\U0001D456-\U0001D49C\U0001D49E-\U0001D49F\U0001D4A2-\U0001D4A2\U0001D4A5-\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B9\U0001D4BB-\U0001D4BB\U0001D4BD-\U0001D4C3\U0001D4C5-\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D51E-\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546-\U0001D546\U0001D54A-\U0001D550\U0001D552-\U0001D6A5\U0001D6A8-\U0001D6C0\U0001D6C2-\U0001D6DA\U0001D6DC-\U0001D6FA\U0001D6FC-\U0001D714\U0001D716-\U0001D734\U0001D736-\U0001D74E\U0001D750-\U0001D76E\U0
001D770-\U0001D788\U0001D78A-\U0001D7A8\U0001D7AA-\U0001D7C2\U0001D7C4-\U0001D7CB\U0001E100-\U0001E12C\U0001E137-\U0001E13D\U0001E14E-\U0001E14E\U0001E2C0-\U0001E2EB\U0001E800-\U0001E8C4\U0001E900-\U0001E943\U0001E94B-\U0001E94B\U0001EE00-\U0001EE03\U0001EE05-\U0001EE1F\U0001EE21-\U0001EE22\U0001EE24-\U0001EE24\U0001EE27-\U0001EE27\U0001EE29-\U0001EE32\U0001EE34-\U0001EE37\U0001EE39-\U0001EE39\U0001EE3B-\U0001EE3B\U0001EE42-\U0001EE42\U0001EE47-\U0001EE47\U0001EE49-\U0001EE49\U0001EE4B-\U0001EE4B\U0001EE4D-\U0001EE4F\U0001EE51-\U0001EE52\U0001EE54-\U0001EE54\U0001EE57-\U0001EE57\U0001EE59-\U0001EE59\U0001EE5B-\U0001EE5B\U0001EE5D-\U0001EE5D\U0001EE5F-\U0001EE5F\U0001EE61-\U0001EE62\U0001EE64-\U0001EE64\U0001EE67-\U0001EE6A\U0001EE6C-\U0001EE72\U0001EE74-\U0001EE77\U0001EE79-\U0001EE7C\U0001EE7E-\U0001EE7E\U0001EE80-\U0001EE89\U0001EE8B-\U0001EE9B\U0001EEA1-\U0001EEA3\U0001EEA5-\U0001EEA9\U0001EEAB-\U0001EEBB\U00020000-\U0002A6DD\U0002A700-\U0002B734\U0002B740-\U0002B81D\U0002B820-\U0002CEA1\U0002CEB0-\U0002EBE0\U0002F800-\U0002FA1D\U00030000-\U0003134A\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A6
20-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]+|\\s+(?!\\S)", + //digits + L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\
U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]+", + L"[0-9][0-9][0-9]" + }; + +static const std::vector deepseek_coder_regex = { + L"[\r\n]", + //\s?\p{L}+ + L"\\s?[\U00000041-\U0000005A\U00000061-\U0000007A\U000000AA-\U000000AA\U000000B5-\U000000B5\U000000BA-\U000000BA\U000000C0-\U000000D6\U000000D8-\U000000F6\U000000F8-\U000002C1\U000002C6-\U000002D1\U000002E0-\U000002E4\U000002EC-\U000002EC\U000002EE-\U000002EE\U00000370-\U00000374\U00000376-\U00000377\U0000037A-\U0000037D\U0000037F-\U0000037F\U00000386-\U00000386\U00000388-\U0000038A\U0000038C-\U0000038C\U0000038E-\U000003A1\U000003A3-\U000003F5\U000003F7-\U00000481\U0000048A-\U0000052F\U00000531-\U00000556\U00000559-\U00000559\U00000560-\U00000588\U000005D0-\U000005EA\U000005EF-\U000005F2\U00000620-\U0000064A\U0000066E-\U0000066F\U00000671-\U000006D3\U000006D5-\U000006D5\U000006E5-\U000006E6\U000006EE-\U000006EF\U000006FA-\U000006FC\U000006FF-\U000006FF\U00000710-\U00000710\U00000712-\U0000072F\U0000074D-\U000007A5\U000007B1-\U000007B1\U000007CA-\U000007EA\U000007F4-\U000007F5\U000007FA-\U000007FA\U00000800-\U00000815\U0000081A-\U0000081A\U00000824-\U00000824\U00000828-\U00000828\U00000840-\U00000858\U00000860-\U0000086A\U000008A0-\U000008B4\U000008B6-\U000008C7\U00000904-\U00000939\U0000093D-\U0000093D\U00000950-\U00000950\U00000958-\U00000961\U00000971-\U00000980\U00000985-\U0000098C\U0000098F-\U00000990\U00000993-\U000009A8\U000009AA-\U000009B0\U000009B2-\U000009B2\U000009B6-\U000009B9\U000009BD-\U000009BD\U000009CE-\U000009CE\U000009DC-\U000009DD\U000009DF-\U000009E1\U000009F0-\U000009F1\U000009FC-\
U000009FC\U00000A05-\U00000A0A\U00000A0F-\U00000A10\U00000A13-\U00000A28\U00000A2A-\U00000A30\U00000A32-\U00000A33\U00000A35-\U00000A36\U00000A38-\U00000A39\U00000A59-\U00000A5C\U00000A5E-\U00000A5E\U00000A72-\U00000A74\U00000A85-\U00000A8D\U00000A8F-\U00000A91\U00000A93-\U00000AA8\U00000AAA-\U00000AB0\U00000AB2-\U00000AB3\U00000AB5-\U00000AB9\U00000ABD-\U00000ABD\U00000AD0-\U00000AD0\U00000AE0-\U00000AE1\U00000AF9-\U00000AF9\U00000B05-\U00000B0C\U00000B0F-\U00000B10\U00000B13-\U00000B28\U00000B2A-\U00000B30\U00000B32-\U00000B33\U00000B35-\U00000B39\U00000B3D-\U00000B3D\U00000B5C-\U00000B5D\U00000B5F-\U00000B61\U00000B71-\U00000B71\U00000B83-\U00000B83\U00000B85-\U00000B8A\U00000B8E-\U00000B90\U00000B92-\U00000B95\U00000B99-\U00000B9A\U00000B9C-\U00000B9C\U00000B9E-\U00000B9F\U00000BA3-\U00000BA4\U00000BA8-\U00000BAA\U00000BAE-\U00000BB9\U00000BD0-\U00000BD0\U00000C05-\U00000C0C\U00000C0E-\U00000C10\U00000C12-\U00000C28\U00000C2A-\U00000C39\U00000C3D-\U00000C3D\U00000C58-\U00000C5A\U00000C60-\U00000C61\U00000C80-\U00000C80\U00000C85-\U00000C8C\U00000C8E-\U00000C90\U00000C92-\U00000CA8\U00000CAA-\U00000CB3\U00000CB5-\U00000CB9\U00000CBD-\U00000CBD\U00000CDE-\U00000CDE\U00000CE0-\U00000CE1\U00000CF1-\U00000CF2\U00000D04-\U00000D0C\U00000D0E-\U00000D10\U00000D12-\U00000D3A\U00000D3D-\U00000D3D\U00000D4E-\U00000D4E\U00000D54-\U00000D56\U00000D5F-\U00000D61\U00000D7A-\U00000D7F\U00000D85-\U00000D96\U00000D9A-\U00000DB1\U00000DB3-\U00000DBB\U00000DBD-\U00000DBD\U00000DC0-\U00000DC6\U00000E01-\U00000E30\U00000E32-\U00000E33\U00000E40-\U00000E46\U00000E81-\U00000E82\U00000E84-\U00000E84\U00000E86-\U00000E8A\U00000E8C-\U00000EA3\U00000EA5-\U00000EA5\U00000EA7-\U00000EB0\U00000EB2-\U00000EB3\U00000EBD-\U00000EBD\U00000EC0-\U00000EC4\U00000EC6-\U00000EC6\U00000EDC-\U00000EDF\U00000F00-\U00000F00\U00000F40-\U00000F47\U00000F49-\U00000F6C\U00000F88-\U00000F8C\U00001000-\U0000102A\U0000103F-\U0000103F\U00001050-\U00001055\U0000105A-\U0000105D\U00001061-\U00001061\U00001065-\U0000
1066\U0000106E-\U00001070\U00001075-\U00001081\U0000108E-\U0000108E\U000010A0-\U000010C5\U000010C7-\U000010C7\U000010CD-\U000010CD\U000010D0-\U000010FA\U000010FC-\U00001248\U0000124A-\U0000124D\U00001250-\U00001256\U00001258-\U00001258\U0000125A-\U0000125D\U00001260-\U00001288\U0000128A-\U0000128D\U00001290-\U000012B0\U000012B2-\U000012B5\U000012B8-\U000012BE\U000012C0-\U000012C0\U000012C2-\U000012C5\U000012C8-\U000012D6\U000012D8-\U00001310\U00001312-\U00001315\U00001318-\U0000135A\U00001380-\U0000138F\U000013A0-\U000013F5\U000013F8-\U000013FD\U00001401-\U0000166C\U0000166F-\U0000167F\U00001681-\U0000169A\U000016A0-\U000016EA\U000016F1-\U000016F8\U00001700-\U0000170C\U0000170E-\U00001711\U00001720-\U00001731\U00001740-\U00001751\U00001760-\U0000176C\U0000176E-\U00001770\U00001780-\U000017B3\U000017D7-\U000017D7\U000017DC-\U000017DC\U00001820-\U00001878\U00001880-\U00001884\U00001887-\U000018A8\U000018AA-\U000018AA\U000018B0-\U000018F5\U00001900-\U0000191E\U00001950-\U0000196D\U00001970-\U00001974\U00001980-\U000019AB\U000019B0-\U000019C9\U00001A00-\U00001A16\U00001A20-\U00001A54\U00001AA7-\U00001AA7\U00001B05-\U00001B33\U00001B45-\U00001B4B\U00001B83-\U00001BA0\U00001BAE-\U00001BAF\U00001BBA-\U00001BE5\U00001C00-\U00001C23\U00001C4D-\U00001C4F\U00001C5A-\U00001C7D\U00001C80-\U00001C88\U00001C90-\U00001CBA\U00001CBD-\U00001CBF\U00001CE9-\U00001CEC\U00001CEE-\U00001CF3\U00001CF5-\U00001CF6\U00001CFA-\U00001CFA\U00001D00-\U00001DBF\U00001E00-\U00001F15\U00001F18-\U00001F1D\U00001F20-\U00001F45\U00001F48-\U00001F4D\U00001F50-\U00001F57\U00001F59-\U00001F59\U00001F5B-\U00001F5B\U00001F5D-\U00001F5D\U00001F5F-\U00001F7D\U00001F80-\U00001FB4\U00001FB6-\U00001FBC\U00001FBE-\U00001FBE\U00001FC2-\U00001FC4\U00001FC6-\U00001FCC\U00001FD0-\U00001FD3\U00001FD6-\U00001FDB\U00001FE0-\U00001FEC\U00001FF2-\U00001FF4\U00001FF6-\U00001FFC\U00002071-\U00002071\U0000207F-\U0000207F\U00002090-\U0000209C\U00002102-\U00002102\U00002107-\U00002107\U0000210A-\U00002113\U00002115-\U00002115\
U00002119-\U0000211D\U00002124-\U00002124\U00002126-\U00002126\U00002128-\U00002128\U0000212A-\U0000212D\U0000212F-\U00002139\U0000213C-\U0000213F\U00002145-\U00002149\U0000214E-\U0000214E\U00002183-\U00002184\U00002C00-\U00002C2E\U00002C30-\U00002C5E\U00002C60-\U00002CE4\U00002CEB-\U00002CEE\U00002CF2-\U00002CF3\U00002D00-\U00002D25\U00002D27-\U00002D27\U00002D2D-\U00002D2D\U00002D30-\U00002D67\U00002D6F-\U00002D6F\U00002D80-\U00002D96\U00002DA0-\U00002DA6\U00002DA8-\U00002DAE\U00002DB0-\U00002DB6\U00002DB8-\U00002DBE\U00002DC0-\U00002DC6\U00002DC8-\U00002DCE\U00002DD0-\U00002DD6\U00002DD8-\U00002DDE\U00002E2F-\U00002E2F\U00003005-\U00003006\U00003031-\U00003035\U0000303B-\U0000303C\U00003041-\U00003096\U0000309D-\U0000309F\U000030A1-\U000030FA\U000030FC-\U000030FF\U00003105-\U0000312F\U00003131-\U0000318E\U000031A0-\U000031BF\U000031F0-\U000031FF\U00003400-\U00004DBF\U00004E00-\U00009FFC\U0000A000-\U0000A48C\U0000A4D0-\U0000A4FD\U0000A500-\U0000A60C\U0000A610-\U0000A61F\U0000A62A-\U0000A62B\U0000A640-\U0000A66E\U0000A67F-\U0000A69D\U0000A6A0-\U0000A6E5\U0000A717-\U0000A71F\U0000A722-\U0000A788\U0000A78B-\U0000A7BF\U0000A7C2-\U0000A7CA\U0000A7F5-\U0000A801\U0000A803-\U0000A805\U0000A807-\U0000A80A\U0000A80C-\U0000A822\U0000A840-\U0000A873\U0000A882-\U0000A8B3\U0000A8F2-\U0000A8F7\U0000A8FB-\U0000A8FB\U0000A8FD-\U0000A8FE\U0000A90A-\U0000A925\U0000A930-\U0000A946\U0000A960-\U0000A97C\U0000A984-\U0000A9B2\U0000A9CF-\U0000A9CF\U0000A9E0-\U0000A9E4\U0000A9E6-\U0000A9EF\U0000A9FA-\U0000A9FE\U0000AA00-\U0000AA28\U0000AA40-\U0000AA42\U0000AA44-\U0000AA4B\U0000AA60-\U0000AA76\U0000AA7A-\U0000AA7A\U0000AA7E-\U0000AAAF\U0000AAB1-\U0000AAB1\U0000AAB5-\U0000AAB6\U0000AAB9-\U0000AABD\U0000AAC0-\U0000AAC0\U0000AAC2-\U0000AAC2\U0000AADB-\U0000AADD\U0000AAE0-\U0000AAEA\U0000AAF2-\U0000AAF4\U0000AB01-\U0000AB06\U0000AB09-\U0000AB0E\U0000AB11-\U0000AB16\U0000AB20-\U0000AB26\U0000AB28-\U0000AB2E\U0000AB30-\U0000AB5A\U0000AB5C-\U0000AB69\U0000AB70-\U0000ABE2\U0000AC00-\U0000D7A3\U0000
D7B0-\U0000D7C6\U0000D7CB-\U0000D7FB\U0000F900-\U0000FA6D\U0000FA70-\U0000FAD9\U0000FB00-\U0000FB06\U0000FB13-\U0000FB17\U0000FB1D-\U0000FB1D\U0000FB1F-\U0000FB28\U0000FB2A-\U0000FB36\U0000FB38-\U0000FB3C\U0000FB3E-\U0000FB3E\U0000FB40-\U0000FB41\U0000FB43-\U0000FB44\U0000FB46-\U0000FBB1\U0000FBD3-\U0000FD3D\U0000FD50-\U0000FD8F\U0000FD92-\U0000FDC7\U0000FDF0-\U0000FDFB\U0000FE70-\U0000FE74\U0000FE76-\U0000FEFC\U0000FF21-\U0000FF3A\U0000FF41-\U0000FF5A\U0000FF66-\U0000FFBE\U0000FFC2-\U0000FFC7\U0000FFCA-\U0000FFCF\U0000FFD2-\U0000FFD7\U0000FFDA-\U0000FFDC\U00010000-\U0001000B\U0001000D-\U00010026\U00010028-\U0001003A\U0001003C-\U0001003D\U0001003F-\U0001004D\U00010050-\U0001005D\U00010080-\U000100FA\U00010280-\U0001029C\U000102A0-\U000102D0\U00010300-\U0001031F\U0001032D-\U00010340\U00010342-\U00010349\U00010350-\U00010375\U00010380-\U0001039D\U000103A0-\U000103C3\U000103C8-\U000103CF\U00010400-\U0001049D\U000104B0-\U000104D3\U000104D8-\U000104FB\U00010500-\U00010527\U00010530-\U00010563\U00010600-\U00010736\U00010740-\U00010755\U00010760-\U00010767\U00010800-\U00010805\U00010808-\U00010808\U0001080A-\U00010835\U00010837-\U00010838\U0001083C-\U0001083C\U0001083F-\U00010855\U00010860-\U00010876\U00010880-\U0001089E\U000108E0-\U000108F2\U000108F4-\U000108F5\U00010900-\U00010915\U00010920-\U00010939\U00010980-\U000109B7\U000109BE-\U000109BF\U00010A00-\U00010A00\U00010A10-\U00010A13\U00010A15-\U00010A17\U00010A19-\U00010A35\U00010A60-\U00010A7C\U00010A80-\U00010A9C\U00010AC0-\U00010AC7\U00010AC9-\U00010AE4\U00010B00-\U00010B35\U00010B40-\U00010B55\U00010B60-\U00010B72\U00010B80-\U00010B91\U00010C00-\U00010C48\U00010C80-\U00010CB2\U00010CC0-\U00010CF2\U00010D00-\U00010D23\U00010E80-\U00010EA9\U00010EB0-\U00010EB1\U00010F00-\U00010F1C\U00010F27-\U00010F27\U00010F30-\U00010F45\U00010FB0-\U00010FC4\U00010FE0-\U00010FF6\U00011003-\U00011037\U00011083-\U000110AF\U000110D0-\U000110E8\U00011103-\U00011126\U00011144-\U00011144\U00011147-\U00011147\U00011150-\U00011172\U00011176-
\U00011176\U00011183-\U000111B2\U000111C1-\U000111C4\U000111DA-\U000111DA\U000111DC-\U000111DC\U00011200-\U00011211\U00011213-\U0001122B\U00011280-\U00011286\U00011288-\U00011288\U0001128A-\U0001128D\U0001128F-\U0001129D\U0001129F-\U000112A8\U000112B0-\U000112DE\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U0001133D-\U0001133D\U00011350-\U00011350\U0001135D-\U00011361\U00011400-\U00011434\U00011447-\U0001144A\U0001145F-\U00011461\U00011480-\U000114AF\U000114C4-\U000114C5\U000114C7-\U000114C7\U00011580-\U000115AE\U000115D8-\U000115DB\U00011600-\U0001162F\U00011644-\U00011644\U00011680-\U000116AA\U000116B8-\U000116B8\U00011700-\U0001171A\U00011800-\U0001182B\U000118A0-\U000118DF\U000118FF-\U00011906\U00011909-\U00011909\U0001190C-\U00011913\U00011915-\U00011916\U00011918-\U0001192F\U0001193F-\U0001193F\U00011941-\U00011941\U000119A0-\U000119A7\U000119AA-\U000119D0\U000119E1-\U000119E1\U000119E3-\U000119E3\U00011A00-\U00011A00\U00011A0B-\U00011A32\U00011A3A-\U00011A3A\U00011A50-\U00011A50\U00011A5C-\U00011A89\U00011A9D-\U00011A9D\U00011AC0-\U00011AF8\U00011C00-\U00011C08\U00011C0A-\U00011C2E\U00011C40-\U00011C40\U00011C72-\U00011C8F\U00011D00-\U00011D06\U00011D08-\U00011D09\U00011D0B-\U00011D30\U00011D46-\U00011D46\U00011D60-\U00011D65\U00011D67-\U00011D68\U00011D6A-\U00011D89\U00011D98-\U00011D98\U00011EE0-\U00011EF2\U00011FB0-\U00011FB0\U00012000-\U00012399\U00012480-\U00012543\U00013000-\U0001342E\U00014400-\U00014646\U00016800-\U00016A38\U00016A40-\U00016A5E\U00016AD0-\U00016AED\U00016B00-\U00016B2F\U00016B40-\U00016B43\U00016B63-\U00016B77\U00016B7D-\U00016B8F\U00016E40-\U00016E7F\U00016F00-\U00016F4A\U00016F50-\U00016F50\U00016F93-\U00016F9F\U00016FE0-\U00016FE1\U00016FE3-\U00016FE3\U00017000-\U000187F7\U00018800-\U00018CD5\U00018D00-\U00018D08\U0001B000-\U0001B11E\U0001B150-\U0001B152\U0001B164-\U0001B167\U0001B170-\U0001B2FB\U0001BC00-\U0001BC6A\U0001BC70-\U0001BC7C\U0001BC80-\U000
1BC88\U0001BC90-\U0001BC99\U0001D400-\U0001D454\U0001D456-\U0001D49C\U0001D49E-\U0001D49F\U0001D4A2-\U0001D4A2\U0001D4A5-\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B9\U0001D4BB-\U0001D4BB\U0001D4BD-\U0001D4C3\U0001D4C5-\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D51E-\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546-\U0001D546\U0001D54A-\U0001D550\U0001D552-\U0001D6A5\U0001D6A8-\U0001D6C0\U0001D6C2-\U0001D6DA\U0001D6DC-\U0001D6FA\U0001D6FC-\U0001D714\U0001D716-\U0001D734\U0001D736-\U0001D74E\U0001D750-\U0001D76E\U0001D770-\U0001D788\U0001D78A-\U0001D7A8\U0001D7AA-\U0001D7C2\U0001D7C4-\U0001D7CB\U0001E100-\U0001E12C\U0001E137-\U0001E13D\U0001E14E-\U0001E14E\U0001E2C0-\U0001E2EB\U0001E800-\U0001E8C4\U0001E900-\U0001E943\U0001E94B-\U0001E94B\U0001EE00-\U0001EE03\U0001EE05-\U0001EE1F\U0001EE21-\U0001EE22\U0001EE24-\U0001EE24\U0001EE27-\U0001EE27\U0001EE29-\U0001EE32\U0001EE34-\U0001EE37\U0001EE39-\U0001EE39\U0001EE3B-\U0001EE3B\U0001EE42-\U0001EE42\U0001EE47-\U0001EE47\U0001EE49-\U0001EE49\U0001EE4B-\U0001EE4B\U0001EE4D-\U0001EE4F\U0001EE51-\U0001EE52\U0001EE54-\U0001EE54\U0001EE57-\U0001EE57\U0001EE59-\U0001EE59\U0001EE5B-\U0001EE5B\U0001EE5D-\U0001EE5D\U0001EE5F-\U0001EE5F\U0001EE61-\U0001EE62\U0001EE64-\U0001EE64\U0001EE67-\U0001EE6A\U0001EE6C-\U0001EE72\U0001EE74-\U0001EE77\U0001EE79-\U0001EE7C\U0001EE7E-\U0001EE7E\U0001EE80-\U0001EE89\U0001EE8B-\U0001EE9B\U0001EEA1-\U0001EEA3\U0001EEA5-\U0001EEA9\U0001EEAB-\U0001EEBB\U00020000-\U0002A6DD\U0002A700-\U0002B734\U0002B740-\U0002B81D\U0002B820-\U0002CEA1\U0002CEB0-\U0002EBE0\U0002F800-\U0002FA1D\U00030000-\U0003134A]+", + //\s?\p{P}+ + 
L"\\s?[\U00000021-\U00000023\U00000025-\\\U0000002A\U0000002C-\U0000002F\U0000003A-\U0000003B\\\U0000003F-\U00000040\\\U0000005B-\\\U0000005D\U0000005F-\U0000005F\U0000007B-\U0000007B\U0000007D-\U0000007D\U000000A1-\U000000A1\U000000A7-\U000000A7\U000000AB-\U000000AB\U000000B6-\U000000B7\U000000BB-\U000000BB\U000000BF-\U000000BF\U0000037E-\U0000037E\U00000387-\U00000387\U0000055A-\U0000055F\U00000589-\U0000058A\U000005BE-\U000005BE\U000005C0-\U000005C0\U000005C3-\U000005C3\U000005C6-\U000005C6\U000005F3-\U000005F4\U00000609-\U0000060A\U0000060C-\U0000060D\U0000061B-\U0000061B\U0000061E-\U0000061F\U0000066A-\U0000066D\U000006D4-\U000006D4\U00000700-\U0000070D\U000007F7-\U000007F9\U00000830-\U0000083E\U0000085E-\U0000085E\U00000964-\U00000965\U00000970-\U00000970\U000009FD-\U000009FD\U00000A76-\U00000A76\U00000AF0-\U00000AF0\U00000C77-\U00000C77\U00000C84-\U00000C84\U00000DF4-\U00000DF4\U00000E4F-\U00000E4F\U00000E5A-\U00000E5B\U00000F04-\U00000F12\U00000F14-\U00000F14\U00000F3A-\U00000F3D\U00000F85-\U00000F85\U00000FD0-\U00000FD4\U00000FD9-\U00000FDA\U0000104A-\U0000104F\U000010FB-\U000010FB\U00001360-\U00001368\U00001400-\U00001400\U0000166E-\U0000166E\U0000169B-\U0000169C\U000016EB-\U000016ED\U00001735-\U00001736\U000017D4-\U000017D6\U000017D8-\U000017DA\U00001800-\U0000180A\U00001944-\U00001945\U00001A1E-\U00001A1F\U00001AA0-\U00001AA6\U00001AA8-\U00001AAD\U00001B5A-\U00001B60\U00001BFC-\U00001BFF\U00001C3B-\U00001C3F\U00001C7E-\U00001C7F\U00001CC0-\U00001CC7\U00001CD3-\U00001CD3\U00002010-\U00002027\U00002030-\U00002043\U00002045-\U00002051\U00002053-\U0000205E\U0000207D-\U0000207E\U0000208D-\U0000208E\U00002308-\U0000230B\U00002329-\U0000232A\U00002768-\U00002775\U000027C5-\U000027C6\U000027E6-\U000027EF\U00002983-\U00002998\U000029D8-\U000029DB\U000029FC-\U000029FD\U00002CF9-\U00002CFC\U00002CFE-\U00002CFF\U00002D70-\U00002D70\U00002E00-\U00002E2E\U00002E30-\U00002E4F\U00002E52-\U00002E52\U00003001-\U00003003\U00003008-\U00003011\U00003014-\U0000301F\U00003030-
\U00003030\U0000303D-\U0000303D\U000030A0-\U000030A0\U000030FB-\U000030FB\U0000A4FE-\U0000A4FF\U0000A60D-\U0000A60F\U0000A673-\U0000A673\U0000A67E-\U0000A67E\U0000A6F2-\U0000A6F7\U0000A874-\U0000A877\U0000A8CE-\U0000A8CF\U0000A8F8-\U0000A8FA\U0000A8FC-\U0000A8FC\U0000A92E-\U0000A92F\U0000A95F-\U0000A95F\U0000A9C1-\U0000A9CD\U0000A9DE-\U0000A9DF\U0000AA5C-\U0000AA5F\U0000AADE-\U0000AADF\U0000AAF0-\U0000AAF1\U0000ABEB-\U0000ABEB\U0000FD3E-\U0000FD3F\U0000FE10-\U0000FE19\U0000FE30-\U0000FE52\U0000FE54-\U0000FE61\U0000FE63-\U0000FE63\U0000FE68-\U0000FE68\U0000FE6A-\U0000FE6B\U0000FF01-\U0000FF03\U0000FF05-\U0000FF0A\U0000FF0C-\U0000FF0F\U0000FF1A-\U0000FF1B\U0000FF1F-\U0000FF20\U0000FF3B-\U0000FF3D\U0000FF3F-\U0000FF3F\U0000FF5B-\U0000FF5B\U0000FF5D-\U0000FF5D\U0000FF5F-\U0000FF65\U00010100-\U00010102\U0001039F-\U0001039F\U000103D0-\U000103D0\U0001056F-\U0001056F\U00010857-\U00010857\U0001091F-\U0001091F\U0001093F-\U0001093F\U00010A50-\U00010A58\U00010A7F-\U00010A7F\U00010AF0-\U00010AF6\U00010B39-\U00010B3F\U00010B99-\U00010B9C\U00010EAD-\U00010EAD\U00010F55-\U00010F59\U00011047-\U0001104D\U000110BB-\U000110BC\U000110BE-\U000110C1\U00011140-\U00011143\U00011174-\U00011175\U000111C5-\U000111C8\U000111CD-\U000111CD\U000111DB-\U000111DB\U000111DD-\U000111DF\U00011238-\U0001123D\U000112A9-\U000112A9\U0001144B-\U0001144F\U0001145A-\U0001145B\U0001145D-\U0001145D\U000114C6-\U000114C6\U000115C1-\U000115D7\U00011641-\U00011643\U00011660-\U0001166C\U0001173C-\U0001173E\U0001183B-\U0001183B\U00011944-\U00011946\U000119E2-\U000119E2\U00011A3F-\U00011A46\U00011A9A-\U00011A9C\U00011A9E-\U00011AA2\U00011C41-\U00011C45\U00011C70-\U00011C71\U00011EF7-\U00011EF8\U00011FFF-\U00011FFF\U00012470-\U00012474\U00016A6E-\U00016A6F\U00016AF5-\U00016AF5\U00016B37-\U00016B3B\U00016B44-\U00016B44\U00016E97-\U00016E9A\U00016FE2-\U00016FE2\U0001BC9F-\U0001BC9F\U0001DA87-\U0001DA8B\U0001E95E-\U0001E95F]+", + //cjk + L"[\u4e00-\u9fa5\u0800-\u4e00\uac00-\ud7ff]+", + //digits + 
L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]" + }; + +static const std::vector deepseek_llm_regex = { + L"[\r\n]", + L"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + 
L"\\s?[\u0021-\u002f\u003a-\u007e\uff01-\uff0f\uff1a-\uff5e\u2018-\u201f\u3000-\u3002]+", + L"\\s+$", + L"[\u4e00-\u9fa5\u0800-\u4e00\uac00-\ud7ff]+", + L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]" + }; + +inline std::wstring from_utf8(const std::string & s) +{ + std::wstring_convert> conv; + return conv.from_bytes(s); +} + +inline std::string 
to_utf8(const std::wstring & ws) +{ + // code to convert from utf32/utf16 to utf8 + std::wstring_convert, wchar_t> converter; + std::string utf8 = converter.to_bytes(ws); + return utf8; +} +