Do not use aggressive dash splitting in tokenization (mozilla#718)

* Do not use aggressive dash splitting in tokenization
* Add another use case with dash
* Add more test cases
mozilla-releng · Aug 29, 2024 · ffe461f · ffe461f
1 parent 78408b8
commit ffe461f
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 3 deletions.
diff --git a/pipeline/alignments/tokenizer.py b/pipeline/alignments/tokenizer.py
@@ -36,13 +36,12 @@ def _tokenize_lines(params) -> List[str]:
     from mosestokenizer import MosesTokenizer
 
     try:
-        # Use aggressive dash splitting to reduce vocabulary size
-        tokenizer = MosesTokenizer(lang, aggressive_dash_splits=True)
+        tokenizer = MosesTokenizer(lang)
     except RuntimeError as err:
         msg = str(err)
         if "No known abbreviations for language" in msg:
             # Fall-back to English if the language is not found
-            tokenizer = MosesTokenizer("en", aggressive_dash_splits=True)
+            tokenizer = MosesTokenizer("en")
         else:
             raise err
 

diff --git a/tests/test_aln_mapping.py b/tests/test_aln_mapping.py
@@ -12,6 +12,9 @@
         ("Hi", {0: 0}),
         ("Hello, world!", {0: 0, 1: 0, 2: 1, 3: 1}),
         ("Hello,  world!", {0: 0, 1: 0, 2: 1, 3: 1}),
+        ("Hello,  half-world and welcome!", {0: 0, 1: 0, 2: 1, 3: 2, 4: 3, 5: 3}),
+        ("Hello - world!", {0: 0, 1: 1, 2: 2, 3: 2}),
+        ("Hello,- world!", {0: 0, 1: 0, 2: 0, 3: 1, 4: 1}),
         (
             "“I will not,” retorted the Witch, “for it is now my shoe, and not yours.”",
             {
@@ -48,6 +51,7 @@ def test_remap_indices(orig, expected_idx_map):
     """
     tokenized = tokenizer.tokenize(orig)
     tokenized_str = " ".join(tokenized)
+    print(tokenized_str)
 
     idx_map = map_indices(tokenized_str, orig)