From ffe461f283a27372e40e7ca3945308775ddb5a62 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Tue, 2 Jul 2024 08:58:51 -0700 Subject: [PATCH] Do not use aggressive dash splitting in tokenization (#718) * Do not use aggressive dash splitting in tokenization * Add another use case with dash * Add more test cases --- pipeline/alignments/tokenizer.py | 5 ++--- tests/test_aln_mapping.py | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pipeline/alignments/tokenizer.py b/pipeline/alignments/tokenizer.py index b6388a174..8b1b9968a 100644 --- a/pipeline/alignments/tokenizer.py +++ b/pipeline/alignments/tokenizer.py @@ -36,13 +36,12 @@ def _tokenize_lines(params) -> List[str]: from mosestokenizer import MosesTokenizer try: - # Use aggressive dash splitting to reduce vocabulary size - tokenizer = MosesTokenizer(lang, aggressive_dash_splits=True) + tokenizer = MosesTokenizer(lang) except RuntimeError as err: msg = str(err) if "No known abbreviations for language" in msg: # Fall-back to English if the language is not found - tokenizer = MosesTokenizer("en", aggressive_dash_splits=True) + tokenizer = MosesTokenizer("en") else: raise err diff --git a/tests/test_aln_mapping.py b/tests/test_aln_mapping.py index 2b9c4f71b..28169a611 100644 --- a/tests/test_aln_mapping.py +++ b/tests/test_aln_mapping.py @@ -12,6 +12,9 @@ ("Hi", {0: 0}), ("Hello, world!", {0: 0, 1: 0, 2: 1, 3: 1}), ("Hello, world!", {0: 0, 1: 0, 2: 1, 3: 1}), + ("Hello, half-world and welcome!", {0: 0, 1: 0, 2: 1, 3: 2, 4: 3, 5: 3}), + ("Hello - world!", {0: 0, 1: 1, 2: 2, 3: 2}), + ("Hello,- world!", {0: 0, 1: 0, 2: 0, 3: 1, 4: 1}), ( "“I will not,” retorted the Witch, “for it is now my shoe, and not yours.”", { @@ -48,6 +51,7 @@ def test_remap_indices(orig, expected_idx_map): """ tokenized = tokenizer.tokenize(orig) tokenized_str = " ".join(tokenized) + print(tokenized_str) idx_map = map_indices(tokenized_str, orig)