Skip to content

Commit

Permalink
resolve parity with MOSES-4.0 release
Browse files Browse the repository at this point in the history
  • Loading branch information
Kevin Heffernan committed Nov 17, 2023
1 parent cd6118e commit ea7691c
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
7 changes: 6 additions & 1 deletion laser_encoders/laser_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@

import sentencepiece as spm
from sacremoses import MosesDetokenizer, MosesPunctNormalizer
from unicategories import categories

from laser_encoders.download_models import LaserModelDownloader
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE

# Matches any run of one or more whitespace characters.
SPACE_NORMALIZER = re.compile(r"\s+")
# Characters that unicategories lists under the "C" group (presumably the
# Unicode "Other" categories: control, format, etc. -- confirm against the
# unicategories docs). Materialized once at import time as a set so that
# per-character membership tests are O(1).
NON_PRINT_CHARS = set(categories["C"].characters())

logging.basicConfig(
stream=sys.stdout,
Expand Down Expand Up @@ -59,6 +61,9 @@ def __init__(

assert spm_model.exists(), f"spm model file: {spm_model} does not exist"
self.moses_punct_normalizer = MosesPunctNormalizer(self.lang, perl_parity=True)
# Overwrite two substitution rules for parity with the Moses release-4.0 punctuation normalizer
self.moses_punct_normalizer.substitutions[21] = ("‘", r'"')
self.moses_punct_normalizer.substitutions[22] = ("‚", r'"')
self.moses_detokenizer = MosesDetokenizer()
self.spm_encoder = spm.SentencePieceProcessor(model_file=str(self.spm_model))

Expand All @@ -75,7 +80,7 @@ def log(self, message: str) -> None:

def tokenize(self, text: str) -> str:
# Preprocessing
sentence_text = "".join(c for c in text if c.isprintable)
sentence_text = "".join([c if c not in NON_PRINT_CHARS else " " for c in text])
if self.normalize_punct:
sentence_text = self.moses_punct_normalizer.normalize(sentence_text)
if self.descape:
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ readme = "laser_encoders/README.md"
requires-python = ">=3.8"

dependencies = [
'sacremoses>=0.1.0',
'sacremoses==0.1.0',
'unicategories>=0.1.2',
'sentencepiece>=0.1.99',
'numpy>=1.21.3',
'torch>=1.10.0',
Expand Down

0 comments on commit ea7691c

Please sign in to comment.