From 4c401e510fe2020742d7a70f7432575fc75f4663 Mon Sep 17 00:00:00 2001 From: Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:26:47 +0900 Subject: [PATCH] add exaone pre-tokenizer in `llama-vocab.cpp` Co-Authored-By: compilade <113953597+compilade@users.noreply.github.com> --- src/llama-vocab.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 11fffce9386d7..17deefaa86038 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -388,6 +388,7 @@ struct llm_tokenizer_bpe { case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_CODESHELL: + case LLAMA_VOCAB_PRE_TYPE_EXAONE: regex_exprs = { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",