Commit
Enable StableLM2 Tokenizer (openvinotoolkit#845)
apaniukov authored Jan 25, 2024
1 parent b540a1d commit 9225660
Showing 4 changed files with 11 additions and 6 deletions.
@@ -274,8 +274,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >Tiktoken</td>
-<td >97.69</td>
-<td >216</td>
+<td >97.22</td>
+<td >324</td>
</tr>
<tr>
<td >WordPiece</td>
@@ -519,6 +519,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >97.22</td>
<td >108</td>
</tr>
+<tr>
+<td >Tiktoken</td>
+<td >stabilityai/stablelm-2-1_6b</td>
+<td >96.30</td>
+<td >108</td>
+</tr>
<tr>
<td >WordPiece</td>
<td >ProsusAI/finbert</td>
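A quick sanity check on the Tiktoken aggregate in the report above: the new 97.22 % over 324 tests is the test-weighted combination of the previous 97.69 % over 216 tests and the newly added model's 96.30 % over 108 tests. Since the report percentages are rounded, the check recovers integer pass counts first:

```python
# Recover integer pass counts from the rounded percentages in the report,
# then recompute the aggregate Tiktoken pass rate.
old_passed = round(0.9769 * 216)  # 211 tests passing before this commit
new_passed = round(0.9630 * 108)  # 104 tests passing for stablelm-2-1_6b
total_tests = 216 + 108           # 324
aggregate = round((old_passed + new_passed) / total_tests * 100, 2)
print(aggregate)  # 97.22
```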
@@ -562,9 +562,7 @@ def convert_tiktoken_model_tokenizer(
RegexSplitStep(split_pattern),
BytesToCharsStep(),
BPETokenizationStep.from_tiktoken_encoding(encoding),
-TruncationStep(
-max_length=hf_tokenizer.model_max_length, truncate_right=(hf_tokenizer.truncation_side == "right")
-),
+TruncationStep.from_hf_object(hf_tokenizer),
PaddingStep(pad_right=(hf_tokenizer.padding_side == "right")),
VocabDecoderStep(skip_tokens),
CharsToBytesStep(),
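The change above swaps the inline `TruncationStep(...)` construction for a `TruncationStep.from_hf_object(hf_tokenizer)` factory, so truncation settings are read from the Hugging Face tokenizer in one place. A minimal sketch of what such a factory can look like, assuming a simple dataclass layout (only the attribute names `model_max_length` and `truncation_side` come from the removed lines; everything else here is illustrative):

```python
from dataclasses import dataclass
from types import SimpleNamespace


@dataclass
class TruncationStep:
    max_length: int
    truncate_right: bool

    @classmethod
    def from_hf_object(cls, hf_tokenizer):
        # Read truncation settings directly from the HF tokenizer object,
        # mirroring the keyword arguments of the old inline construction.
        return cls(
            max_length=hf_tokenizer.model_max_length,
            truncate_right=(hf_tokenizer.truncation_side == "right"),
        )


# Stand-in for a real Hugging Face tokenizer (hypothetical values).
fake_tokenizer = SimpleNamespace(model_max_length=4096, truncation_side="right")
step = TruncationStep.from_hf_object(fake_tokenizer)
```

Centralizing the lookup in a classmethod means later changes to how truncation is configured only touch one place rather than every pipeline that builds the step.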
@@ -1,3 +1,3 @@
{
"tokenizers_test.py::test_": 0.9096334185848253
"tokenizers_test.py::test_": 0.9104394066610692
}
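The third file appears to store a pass-rate baseline keyed by test-id prefix, bumped from 0.9096… to 0.9104…, presumably because enabling the StableLM2 tokenizer raised the overall pass rate. A hedged sketch of how such a baseline could gate a test run (the gating logic here is an assumption, not the repository's actual code):

```python
import json

# Baseline file contents after this commit: the minimum expected
# pass rate per test-id prefix.
baseline = json.loads('{"tokenizers_test.py::test_": 0.9104394066610692}')


def meets_baseline(prefix: str, observed_pass_rate: float) -> bool:
    # A run passes the gate if it does not regress below the stored rate.
    return observed_pass_rate >= baseline[prefix]


ok = meets_baseline("tokenizers_test.py::test_", 0.92)
print(ok)  # True
```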
@@ -120,6 +120,7 @@ def unpack_strings(strings):
# "t5-base", # crashes tests
]
tiktiken_models = [
"stabilityai/stablelm-2-1_6b",
"Qwen/Qwen-14B-Chat",
"Salesforce/xgen-7b-8k-base",
]
