From ffe461f283a27372e40e7ca3945308775ddb5a62 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Tue, 2 Jul 2024 08:58:51 -0700
Subject: [PATCH] Do not use aggressive dash splitting in tokenization (#718)

* Do not use aggressive dash splitting in tokenization

* Add another use case with dash

* Add more test cases
---
 pipeline/alignments/tokenizer.py | 5 ++---
 tests/test_aln_mapping.py        | 4 ++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pipeline/alignments/tokenizer.py b/pipeline/alignments/tokenizer.py
index b6388a174..8b1b9968a 100644
--- a/pipeline/alignments/tokenizer.py
+++ b/pipeline/alignments/tokenizer.py
@@ -36,13 +36,12 @@ def _tokenize_lines(params) -> List[str]:
     from mosestokenizer import MosesTokenizer
 
     try:
-        # Use aggressive dash splitting to reduce vocabulary size
-        tokenizer = MosesTokenizer(lang, aggressive_dash_splits=True)
+        tokenizer = MosesTokenizer(lang)
     except RuntimeError as err:
         msg = str(err)
         if "No known abbreviations for language" in msg:
             # Fall-back to English if the language is not found
-            tokenizer = MosesTokenizer("en", aggressive_dash_splits=True)
+            tokenizer = MosesTokenizer("en")
         else:
             raise err
 
diff --git a/tests/test_aln_mapping.py b/tests/test_aln_mapping.py
index 2b9c4f71b..28169a611 100644
--- a/tests/test_aln_mapping.py
+++ b/tests/test_aln_mapping.py
@@ -12,6 +12,9 @@
         ("Hi", {0: 0}),
         ("Hello, world!", {0: 0, 1: 0, 2: 1, 3: 1}),
         ("Hello,  world!", {0: 0, 1: 0, 2: 1, 3: 1}),
+        ("Hello,  half-world and welcome!", {0: 0, 1: 0, 2: 1, 3: 2, 4: 3, 5: 3}),
+        ("Hello - world!", {0: 0, 1: 1, 2: 2, 3: 2}),
+        ("Hello,- world!", {0: 0, 1: 0, 2: 0, 3: 1, 4: 1}),
         (
             "“I will not,” retorted the Witch, “for it is now my shoe, and not yours.”",
             {
@@ -48,6 +51,7 @@ def test_remap_indices(orig, expected_idx_map):
     """
     tokenized = tokenizer.tokenize(orig)
     tokenized_str = " ".join(tokenized)
+    print(tokenized_str)
 
     idx_map = map_indices(tokenized_str, orig)