From a722703135335bbe6d4ae1fa47155dfd25fc573f Mon Sep 17 00:00:00 2001
From: mshannon-sil <131058912+mshannon-sil@users.noreply.github.com>
Date: Tue, 16 Jan 2024 10:05:14 -0500
Subject: [PATCH] use sacremoses normalizer, ensure pretranslate.src.json and
 pretranslate.trg.json use same directory (#96)

* use sacremoses normalizer, ensure pretranslate.src.json and pretranslate.trg.json use same directory

* restore launch.json to commit in main branch

* address efficiency issues

* refactor to have separate uri and folder for shared_file, only normalize with sacremoses for NLLB
---
 machine/jobs/clearml_shared_file_service.py   | 28 +++++++---
 machine/jobs/settings.yaml                    |  5 +-
 machine/jobs/shared_file_service.py           |  7 ++-
 .../huggingface/hugging_face_nmt_engine.py    | 53 ++++++++++++++++---
 .../hugging_face_nmt_model_trainer.py         | 18 ++++---
 5 files changed, 87 insertions(+), 24 deletions(-)

diff --git a/machine/jobs/clearml_shared_file_service.py b/machine/jobs/clearml_shared_file_service.py
index 82213ab1..9b1bdb2a 100644
--- a/machine/jobs/clearml_shared_file_service.py
+++ b/machine/jobs/clearml_shared_file_service.py
@@ -12,7 +12,7 @@ class ClearMLSharedFileService(SharedFileService):
 
     def _download_file(self, path: str, cache: bool = False) -> Path:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         local_folder: Optional[str] = None
         if not cache:
             local_folder = str(self._data_dir)
@@ -22,7 +22,7 @@
         return Path(file_path)
 
     def _download_folder(self, path: str, cache: bool = False) -> Path:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         local_folder: Optional[str] = None
         if not cache:
             local_folder = str(self._data_dir)
@@ -32,22 +32,36 @@
         return Path(folder_path) / path
 
     def _exists_file(self, path: str) -> bool:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         return try_n_times(lambda: StorageManager.exists_file(uri))  # type: ignore
 
     def _upload_file(self, path: str, local_file_path: Path) -> None:
         final_destination = try_n_times(
-            lambda: StorageManager.upload_file(str(local_file_path), f"{self._shared_file_uri}/{path}")
+            lambda: StorageManager.upload_file(
+                str(local_file_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
+            )
         )
         if final_destination is None:
-            logger.error(f"Failed to upload file {str(local_file_path)} to {self._shared_file_uri}/{path}.")
+            logger.error(
+                (
+                    f"Failed to upload file {str(local_file_path)} "
+                    f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
+                )
+            )
 
     def _upload_folder(self, path: str, local_folder_path: Path) -> None:
         final_destination = try_n_times(
-            lambda: StorageManager.upload_folder(str(local_folder_path), f"{self._shared_file_uri}/{path}")
+            lambda: StorageManager.upload_folder(
+                str(local_folder_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
+            )
         )
         if final_destination is None:
-            logger.error(f"Failed to upload folder {str(local_folder_path)} to {self._shared_file_uri}/{path}.")
+            logger.error(
+                (
+                    f"Failed to upload folder {str(local_folder_path)} "
+                    f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
+                )
+            )
 
 
 def try_n_times(func: Callable, n=10):
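Every remote path in the service above is now composed from two settings rather than one, which lets all environments share a bucket while keeping their files in separate folders. A minimal sketch of the composition (values come from settings.yaml below; the build id is hypothetical):

    shared_file_uri = "s3://aqua-ml-data"  # trailing "/" stripped by the _shared_file_uri property
    shared_file_folder = "production"      # "dev" and "ext-qa" are used by the other environments

    def remote_uri(path: str) -> str:
        # Mirrors the f-strings used throughout ClearMLSharedFileService.
        return f"{shared_file_uri}/{shared_file_folder}/{path}"

    print(remote_uri("builds/b1/pretranslate.src.json"))
    # s3://aqua-ml-data/production/builds/b1/pretranslate.src.json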
diff --git a/machine/jobs/settings.yaml b/machine/jobs/settings.yaml
index 79680269..90edbc0e 100644
--- a/machine/jobs/settings.yaml
+++ b/machine/jobs/settings.yaml
@@ -1,6 +1,8 @@
 default:
   model_type: huggingface
   data_dir: ~/machine
+  shared_file_uri: s3://aqua-ml-data/
+  shared_file_folder: production
   pretranslation_batch_size: 1024
   huggingface:
     parent_model_name: facebook/nllb-200-distilled-1.3B
@@ -25,12 +27,13 @@ default:
     add_unk_src_tokens: true
     add_unk_trg_tokens: true
 development:
-  shared_file_uri: s3://aqua-ml-data/dev/
+  shared_file_folder: dev
   huggingface:
     parent_model_name: facebook/nllb-200-distilled-600M
     generate_params:
       num_beams: 1
 staging:
+  shared_file_folder: ext-qa
   huggingface:
     parent_model_name: hf-internal-testing/tiny-random-nllb
     train_params:
diff --git a/machine/jobs/shared_file_service.py b/machine/jobs/shared_file_service.py
index 4fe8e6e0..0c0cb6b3 100644
--- a/machine/jobs/shared_file_service.py
+++ b/machine/jobs/shared_file_service.py
@@ -64,7 +64,7 @@ def generator() -> Generator[PretranslationInfo, None, None]:
     @contextmanager
     def open_target_pretranslation_writer(self) -> Iterator[PretranslationWriter]:
         build_id: str = self._config.build_id
-        build_dir = self._data_dir / "builds" / build_id
+        build_dir = self._data_dir / self._shared_file_folder / "builds" / build_id
         build_dir.mkdir(parents=True, exist_ok=True)
         target_pretranslate_path = build_dir / "pretranslate.trg.json"
         with target_pretranslate_path.open("w", encoding="utf-8", newline="\n") as file:
@@ -96,6 +96,11 @@ def _shared_file_uri(self) -> str:
         shared_file_uri: str = self._config.shared_file_uri
         return shared_file_uri.rstrip("/")
 
+    @property
+    def _shared_file_folder(self) -> str:
+        shared_file_folder: str = self._config.shared_file_folder
+        return shared_file_folder.rstrip("/")
+
     @abstractmethod
     def _download_file(self, path: str, cache: bool = False) -> Path: ...
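The same folder segment is inserted into the local build directory in open_target_pretranslation_writer, so pretranslate.trg.json is written into the directory that pretranslate.src.json is downloaded to. A rough illustration of the resulting layout, assuming the default data_dir of ~/machine and a hypothetical build id:

    from pathlib import Path

    data_dir = Path.home() / "machine"                     # data_dir: ~/machine
    build_dir = data_dir / "production" / "builds" / "b1"  # "b1" is a hypothetical build id
    src_path = build_dir / "pretranslate.src.json"
    trg_path = build_dir / "pretranslate.trg.json"
    assert src_path.parent == trg_path.parent              # the two files now share a directory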
diff --git a/machine/translation/huggingface/hugging_face_nmt_engine.py b/machine/translation/huggingface/hugging_face_nmt_engine.py
index 1505e972..c4f08ed4 100644
--- a/machine/translation/huggingface/hugging_face_nmt_engine.py
+++ b/machine/translation/huggingface/hugging_face_nmt_engine.py
@@ -2,11 +2,23 @@
 
 import gc
 import logging
+import re
 from math import exp, prod
-from typing import Any, Iterable, List, Sequence, Tuple, Union, cast
+from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union, cast
 
 import torch  # pyright: ignore[reportMissingImports]
-from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, TranslationPipeline
+from sacremoses import MosesPunctNormalizer
+from transformers import (
+    AutoConfig,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    NllbTokenizer,
+    NllbTokenizerFast,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+    TranslationPipeline,
+)
 from transformers.generation import BeamSearchEncoderDecoderOutput, GreedySearchEncoderDecoderOutput
 from transformers.tokenization_utils import BatchEncoding, TruncationStrategy
 
@@ -38,6 +50,11 @@ def __init__(
             PreTrainedModel, AutoModelForSeq2SeqLM.from_pretrained(str(self._model), config=model_config)
         )
         self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
+        if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
+            self._mpn = MosesPunctNormalizer()
+            self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
+        else:
+            self._mpn = None
 
         src_lang = self._pipeline_kwargs.get("src_lang")
         tgt_lang = self._pipeline_kwargs.get("tgt_lang")
@@ -71,6 +88,7 @@ def __init__(
         self._pipeline = _TranslationPipeline(
             model=self._model,
             tokenizer=self._tokenizer,
+            mpn=self._mpn,
             batch_size=self._batch_size,
             **self._pipeline_kwargs,
         )
@@ -149,15 +167,34 @@ def close(self) -> None:
 
 
 class _TranslationPipeline(TranslationPipeline):
+    def __init__(
+        self,
+        model: Union[PreTrainedModel, StrPath, str],
+        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+        batch_size: int,
+        mpn: Optional[MosesPunctNormalizer] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(model=model, tokenizer=tokenizer, batch_size=batch_size, **kwargs)
+        self._mpn = mpn
+
     def preprocess(self, *args, truncation=TruncationStrategy.DO_NOT_TRUNCATE, src_lang=None, tgt_lang=None):
         if self.tokenizer is None:
             raise RuntimeError("No tokenizer is specified.")
-        sentences = [
-            s
-            if isinstance(s, str)
-            else self.tokenizer.decode(self.tokenizer.convert_tokens_to_ids(s), use_source_tokenizer=True)
-            for s in args
-        ]
+        if self._mpn:
+            sentences = [
+                self._mpn.normalize(s)
+                if isinstance(s, str)
+                else self.tokenizer.decode(self.tokenizer.convert_tokens_to_ids(s), use_source_tokenizer=True)
+                for s in args
+            ]
+        else:
+            sentences = [
+                s
+                if isinstance(s, str)
+                else self.tokenizer.decode(self.tokenizer.convert_tokens_to_ids(s), use_source_tokenizer=True)
+                for s in args
+            ]
         inputs = cast(
             BatchEncoding, super().preprocess(*sentences, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang)
         )
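Both the engine and the trainer compile the normalizer's substitution patterns once up front instead of on every call, which is the "efficiency issues" item from the commit message. A standalone sketch of the idiom (requires sacremoses; the sample string is illustrative):

    import re

    from sacremoses import MosesPunctNormalizer

    mpn = MosesPunctNormalizer()
    # sacremoses stores (pattern_string, replacement) pairs and feeds them to
    # re.sub on each normalize() call; re.sub also accepts precompiled patterns,
    # so compiling once avoids recompiling the regexes per sentence.
    mpn.substitutions = [(re.compile(r), sub) for r, sub in mpn.substitutions]
    print(mpn.normalize("«Hello   world »"))  # quotes and spacing are normalized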
diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
index e0738715..f77f1638 100644
--- a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
+++ b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -96,6 +96,8 @@ def __init__(
         self.max_target_length = max_target_length
         self._add_unk_src_tokens = add_unk_src_tokens
         self._add_unk_trg_tokens = add_unk_trg_tokens
+        self._mpn = MosesPunctNormalizer()
+        self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
 
     @property
     def stats(self) -> TrainStats:
@@ -169,9 +171,8 @@ def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes:
         for lang_code in lang_codes:
             for ex in train_dataset["translation"]:
                 charset = charset | set(ex[lang_code])
-        mpn = MosesPunctNormalizer()
-        mpn.substitutions = [(re.compile(r), sub) for r, sub in mpn.substitutions]
-        charset = {mpn.normalize(char) for char in charset}
+        if isinstance(tokenizer, (NllbTokenizerFast)):
+            charset = {self._mpn.normalize(char) for char in charset}
         charset = {tokenizer.backend_tokenizer.normalizer.normalize_str(char) for char in charset}
         charset = set(filter(None, {char.strip() for char in charset}))
         missing_characters = sorted(list(charset - vocab))
@@ -302,11 +303,14 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
         )
 
         def preprocess_function(examples):
-            inputs = [ex[src_lang] for ex in examples["translation"]]
-            targets = [ex[tgt_lang] for ex in examples["translation"]]
-            inputs = [prefix + inp for inp in inputs]
-            model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
+            if isinstance(tokenizer, (NllbTokenizer, NllbTokenizerFast)):
+                inputs = [self._mpn.normalize(prefix + ex[src_lang]) for ex in examples["translation"]]
+                targets = [self._mpn.normalize(ex[tgt_lang]) for ex in examples["translation"]]
+            else:
+                inputs = [prefix + ex[src_lang] for ex in examples["translation"]]
+                targets = [ex[tgt_lang] for ex in examples["translation"]]
+            model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
 
             # Tokenize targets with the `text_target` keyword argument
             labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
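The guard in preprocess_function restricts Moses punctuation normalization to NLLB checkpoints, presumably to match the preprocessing NLLB saw during pretraining, while other model families are left untouched. Condensed to its essentials (the helper name and argument shape are hypothetical; the isinstance check is the one used in the patch):

    from transformers import NllbTokenizer, NllbTokenizerFast

    def build_inputs(examples, tokenizer, mpn, src_lang, tgt_lang, prefix=""):
        # Only NLLB tokenizers trigger Moses punctuation normalization.
        if isinstance(tokenizer, (NllbTokenizer, NllbTokenizerFast)):
            inputs = [mpn.normalize(prefix + ex[src_lang]) for ex in examples["translation"]]
            targets = [mpn.normalize(ex[tgt_lang]) for ex in examples["translation"]]
        else:
            inputs = [prefix + ex[src_lang] for ex in examples["translation"]]
            targets = [ex[tgt_lang] for ex in examples["translation"]]
        return inputs, targets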