Versatile Sentence Boundary Detection #460

Open
wants to merge 2 commits into master
27 changes: 26 additions & 1 deletion argostranslate/networking.py
@@ -3,7 +3,12 @@
import random
import urllib.request

from spacy.cli import download as spacy_download
from spacy import load as spacy_load
from os import makedirs
from pathlib import Path
from argostranslate.utils import error, info
from argostranslate import settings

USER_AGENT = "ArgosTranslate"

@@ -30,7 +35,7 @@ def get_protocol(url: str) -> str | None:


def get(url: str, retry_count: int = 3) -> bytes | None:
"""Downloads data from a url and returns it
"""Downloads data from an url and returns it

Args:
url: The url to download (http, https)
@@ -76,3 +81,23 @@ def get_from(urls: list[str], retry_count: int = 3) -> bytes | None:
if attempt is not None:
return attempt
return None


def cache_spacy():
""" Downloads spacy multilingual model and saves it the cache directory for further use"""
spacy_cache = Path(settings.cache_dir / "spacy")
makedirs(spacy_cache, exist_ok=True)
info("Looking for cached Spacy xx_sent_ud_sm.")
spacy_model = Path(spacy_cache / "senter" / "model")
if not spacy_model.exists():
try:
info("Downloading Spacy xx_sent_ud_sm.")
spacy_download("xx_sent_ud_sm")
nlp = spacy_load("xx_sent_ud_sm", exclude=["parser"])
nlp.to_disk(spacy_cache)
info("Spacy xx_sent_ud_sm successfully cached.")
return spacy_cache
except Exception as e:
error(str(e))
return None
else:
return spacy_cache
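
A minimal usage sketch of the helper above, assuming the cache layout it creates under settings.cache_dir / "spacy" (the sample text is arbitrary):

import spacy
from argostranslate.networking import cache_spacy

spacy_cache = cache_spacy()  # downloads and caches xx_sent_ud_sm on the first call, reuses the cache afterwards
if spacy_cache is not None:
    nlp = spacy.load(spacy_cache, exclude=["parser"])
    doc = nlp("The model is cached once. Every later call loads it from disk.")
    print([sentence.text for sentence in doc.sents])
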
21 changes: 16 additions & 5 deletions argostranslate/package.py
@@ -59,7 +59,7 @@ class IPackage:


Packages are a zip archive of a directory with metadata.json
in its root the .argosmodel file extension. By default a
in its root and the .argosmodel file extension. By default, an
OpenNMT CTranslate2 directory named model/ is expected in the root directory
along with a sentencepiece model named sentencepiece.model or a bpe.model
for tokenizing and Stanza data for sentence boundary detection.
@@ -197,6 +197,17 @@ def __init__(self, package_path: Path):
metadata = json.load(metadata_file)
self.load_metadata_from_json(metadata)

""" As of spacy multilingual support, the sbd package shall depend on the Argos package's content"""
stanza_package = package_path / "stanza"
spacy_package = package_path / "spacy"

if stanza_package.exists(): # Stanza tokenizer within the package
self.packaged_sbd_path = stanza_package
elif spacy_package.exists(): # Explicit/language-specific spaCy model within the package
self.packaged_sbd_path = spacy_package
else: # None if no sbd package embedded in the argos package (will default to cache)
self.packaged_sbd_path = None

sp_model_path = package_path / "sentencepiece.model"
bpe_model_path = package_path / "bpe.model"

@@ -261,7 +272,7 @@ def __init__(self, metadata):
def download(self) -> Path:
"""Downloads the AvailablePackage and returns its path"""
filename = argospm_package_name(self) + ".argosmodel"

'''
# Install sbd package if needed
if self.type == "translate" and not settings.stanza_available:
if (
@@ -275,7 +286,7 @@ def download(self) -> Path:
for sbd_package in sbd_packages:
download_path = sbd_package.download()
install_from_path(download_path)

'''
filepath = settings.downloads_dir / filename
if not filepath.exists():
data = networking.get_from(self.links)
@@ -351,7 +362,7 @@ def get_available_packages() -> list[AvailablePackage]:
for metadata in index:
package = AvailablePackage(metadata)
packages.append(package)

'''
# If stanza not available filter for sbd available
if not settings.stanza_available:
installed_and_available_packages = packages + get_installed_packages()
@@ -367,7 +378,7 @@ def get_available_packages() -> list[AvailablePackage]:
filter(lambda x: x.from_code in sbd_available_codes, packages)
)
return packages + sbd_packages

'''
return packages
except FileNotFoundError:
update_package_index()
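
The directory names checked in __init__ above ("stanza" and "spacy" under the package root) suggest how a language-specific spaCy model could be embedded in an .argosmodel; a hedged sketch, with the model and directory names purely illustrative:

from pathlib import Path

import spacy
from spacy.cli import download as spacy_download

model_name = "fr_core_news_sm"              # example only; prefer small models, per the sbd.py docstring
package_dir = Path("translate-fr_en-1_9")   # unpacked .argosmodel contents (hypothetical name)

spacy_download(model_name)                  # fetch the model if it is not installed yet
nlp = spacy.load(model_name, exclude=["parser"])
nlp.to_disk(package_dir / "spacy")          # IPackage.__init__ then picks this up as packaged_sbd_path
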
54 changes: 40 additions & 14 deletions argostranslate/sbd.py
@@ -1,44 +1,70 @@
from __future__ import annotations

from difflib import SequenceMatcher
from typing import List, Optional

from typing import List
import stanza
import spacy

from argostranslate import package
from argostranslate import package, settings
from argostranslate.package import Package
from argostranslate.utils import info
from argostranslate.networking import cache_spacy


class ISentenceBoundaryDetectionModel:
class ISentenceBoundaryDetectionModel():
# https://github.com/argosopentech/sbd/blob/main/main.py
def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
pkg: Package

def split_sentences(self, text: str) -> List[str]:
raise NotImplementedError


# Spacy sentence boundary detection Sentencizer
# https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3
# https://spacy.io/usage/linguistic-features/#sbd

# Download model:
# python -m spacy download xx_sent_ud_sm
class SpacySentencizerSmall(ISentenceBoundaryDetectionModel):
def __init__(self):
try:
self.nlp = spacy.load("xx_sent_ud_sm", exclude=["parser"])
except OSError:
# Automatically download the model if it doesn't exist
spacy.cli.download("xx_sent_ud_sm")
self.nlp = spacy.load("xx_sent_ud_sm", exclude=["parser"])
def __init__(self, pkg: Package):
'''
Packaging a language-specific spaCy model when "xx_sent_ud_sm" does not cover the language improves performance over Stanza.
Please use small models ("*_core_web_sm") for consistency.
'''
if pkg.packaged_sbd_path is not None:
self.nlp = spacy.load(pkg.packaged_sbd_path, exclude=["parser"])
# If no sbd model is packaged, fall back to the cached spaCy multilingual model (xx_sent_ud_sm)
else:
cached_spacy = cache_spacy()
self.nlp = spacy.load(cached_spacy, exclude=["parser"])
self.nlp.add_pipe("sentencizer")

def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
def split_sentences(self, text: str) -> List[str]:
doc = self.nlp(text)
return [sent.text for sent in doc.sents]

def __str__(self):
return "Spacy xx_sent_ud_sm"
return "Using Spacy model."

# Stanza sentence boundary detection Sentencizer (legacy, but quite a few languages need it)
class StanzaSentencizer(ISentenceBoundaryDetectionModel):
# Initializes the Stanza pipeline, formerly coded in translate.py (commented lines 438-477),
# which is actually a tokenizer, hence the slowness when running it
def __init__(self, pkg: Package):
self.stanza_pipeline = stanza.Pipeline(
lang=pkg.from_code,
dir=str(pkg.packaged_sbd_path),
processors="tokenize",
use_gpu=settings.device == "cuda",
logging_level="WARNING",
)

def split_sentences(self, text: str) -> List[str]:
doc = self.stanza_pipeline(text)
return [sent.text for sent in doc.sentences]

def __str__(self):
return "Using Stanza library"

# Few Shot Sentence Boundary Detection

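
A minimal usage sketch of the two sentencizers above, mirroring the dispatch on packaged_sbd_path that translate.py performs (an installed translate package is assumed):

from argostranslate import package
from argostranslate.sbd import SpacySentencizerSmall, StanzaSentencizer

pkg = package.get_installed_packages()[0]  # assumes at least one translate package is installed
if pkg.packaged_sbd_path is not None and "stanza" in str(pkg.packaged_sbd_path):
    sentencizer = StanzaSentencizer(pkg)
else:
    sentencizer = SpacySentencizerSmall(pkg)  # packaged spaCy model, or the cached xx_sent_ud_sm
print(sentencizer.split_sentences("Dr. Smith arrived late. The meeting had already started."))
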
5 changes: 3 additions & 2 deletions argostranslate/settings.py
@@ -159,9 +159,10 @@ def set_setting(key: str, value):
local_package_index = data_dir / "index.json"

experimental_enabled = os.getenv("ARGOS_EXPERIMENTAL_ENABLED") in TRUE_VALUES

'''
# Legacy environment variable for sentence boundary detection
stanza_available = os.getenv("ARGOS_STANZA_AVAILABLE") in (TRUE_VALUES + [None])

'''
# Supported values: "cpu" and "cuda"
device = get_setting("ARGOS_DEVICE_TYPE", "cpu")

6 changes: 5 additions & 1 deletion argostranslate/tokenizer.py
@@ -27,9 +27,12 @@ def encode(self, sentence: str) -> List[str]:
return tokens

def decode(self, tokens: List[str]) -> str:
"""
# Left byte-fallback tokens undecoded, which was quite detrimental to translations into Asian languages
detokenized = "".join(tokens)
return detokenized.replace("▁", " ")

"""
return self.lazy_processor().decode_pieces(tokens).replace("_", " ")

class BPETokenizer(Tokenizer):
def __init__(self, model_file: Path, from_code: str, to_code: str):
@@ -39,6 +42,7 @@ def __init__(self, model_file: Path, from_code: str, to_code: str):
self.tokenizer = None
self.detokenizer = None
self.bpe_source = None
self.normalizer = None

def lazy_load(self):
if self.tokenizer is None:
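
To illustrate why decode_pieces is preferable to joining the raw pieces: with a byte-fallback SentencePiece model, rare characters are emitted as byte pieces that only the processor can reassemble. A hedged sketch (model path and sample text are placeholders):

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="sentencepiece.model")
pieces = sp.encode("こんにちは世界", out_type=str)  # may contain byte-fallback pieces such as "<0xE4>"
naive = "".join(pieces).replace("▁", " ")            # leaves the "<0x..>" fragments in the output
proper = sp.decode_pieces(pieces)                    # reassembles byte-fallback pieces into real characters
print(naive, proper, sep="\n")
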
41 changes: 24 additions & 17 deletions argostranslate/translate.py
@@ -3,13 +3,13 @@
from typing import List

import ctranslate2
import sentencepiece as spm
# import sentencepiece as spm
from ctranslate2 import Translator

from argostranslate import apis, fewshot, package, sbd, settings
from argostranslate.models import ILanguageModel
from argostranslate.package import Package
from argostranslate.sbd import SpacySentencizerSmall
from argostranslate.sbd import SpacySentencizerSmall, StanzaSentencizer
from argostranslate.utils import info


@@ -67,7 +67,7 @@ def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:

Args:
input_text: The text to be translated.
num_hypotheses: Number of hypothetic results expected
num_hypotheses: Number of hypothetical results expected

Returns:
List of translation hypotheses
@@ -113,7 +113,7 @@ class Language:

Attributes:
code: The code representing the language.
name: The human readable name of the language.
name: The human-readable name of the language.
translations_from: A list of the translations
that translate from this language.
translations_to: A list of the translations
@@ -160,7 +160,14 @@ def __init__(self, from_lang: Language, to_lang: Language, pkg: Package):
self.to_lang = to_lang
self.pkg = pkg
self.translator = None
self.sentencizer = SpacySentencizerSmall()
if 'stanza' in str(pkg.packaged_sbd_path):
self.sentencizer = StanzaSentencizer(pkg)
elif pkg.packaged_sbd_path is None or 'spacy' in str(pkg.packaged_sbd_path):
self.sentencizer = SpacySentencizerSmall(pkg)
else:
# Any other SBD dependency should be defined as a class in the SBD module.
raise NotImplementedError()


def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:
if self.translator is None:
@@ -180,8 +187,8 @@ def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:
self.pkg,
paragraph,
self.translator,
num_hypotheses,
self.sentencizer,
num_hypotheses,
)
)
info("translated_paragraphs:", translated_paragraphs)
@@ -297,7 +304,7 @@ def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:
translated_paragraphs = []
for paragraph in paragraphs:
translated_paragraph = self.cache.get(paragraph)
# If len() of our cached items are different than `num_hypotheses` it means that
# If len() of our cached items are different from `num_hypotheses` it means that
# the search parameter is changed by caller, so we can't re-use cache, and should update it.
if (
translated_paragraph is None
@@ -407,27 +414,29 @@ def apply_packaged_translation(
pkg: Package,
input_text: str,
translator: Translator,
sentencizer: sbd.ISentenceBoundaryDetectionModel,
num_hypotheses: int = 4,
sentencizer: sbd.ISentenceBoundaryDetectionModel = SpacySentencizerSmall(),
) -> list[Hypothesis]:
"""Applies the translation in pkg to translate input_text.

Args:
pkg: The package that provides the translation.
input_text: The text to be translated.
translator: The CTranslate2 Translator
sentencizer: The sentence boundary detection model
num_hypotheses: The number of hypotheses to generate

Returns:
A list of Hypothesis's for translating input_text

A list of Hypothesis objects for the translated input_text.
"""

info("apply_packaged_translation", input_text)

# Sentence boundary detection
# Sentence boundary detection
sentences = sentencizer.split_sentences(input_text)
info("sentences", sentences)
"""
# Argos Translate 1.9 Sentence Boundary Detection
# Argos Translate 1.9 Sentence Boundary Detection (legacy)
if pkg.type == "sbd":
sentences = [input_text]
elif settings.stanza_available:
@@ -466,9 +475,6 @@ def apply_packaged_translation(
info(input_text[start_index:sbd_index])
start_index = sbd_index
"""
sentences = sentencizer.split_sentences(input_text)

info("sentences", sentences)

# Tokenization
tokenized = [pkg.tokenizer.encode(sentence) for sentence in sentences]
@@ -539,7 +545,8 @@ def get_installed_languages() -> list[Language]:

if settings.model_provider == settings.ModelProvider.OPENNMT:
packages = package.get_installed_packages()

'''
# Legacy sbd package search (environment-dependent)
# If stanza not available filter for sbd available
if not settings.stanza_available:
sbd_packages = list(filter(lambda x: x.type == "sbd", packages))
Expand All @@ -549,7 +556,7 @@ def get_installed_languages() -> list[Language]:
packages = list(
filter(lambda x: x.from_code in sbd_available_codes, packages)
)

'''
# Filter for translate packages
packages = list(filter(lambda x: x.type == "translate", packages))

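
Since sentencizer now precedes num_hypotheses and has no default, callers of apply_packaged_translation must pass the sentencizer built in the translation's __init__; a hedged call sketch (the installed package, its model/ directory, and the sample text are assumptions):

import ctranslate2
from argostranslate import package, sbd
from argostranslate.translate import apply_packaged_translation

pkg = package.get_installed_packages()[0]  # assumes an installed translate package
translator = ctranslate2.Translator(str(pkg.package_path / "model"), device="cpu")
sentencizer = sbd.SpacySentencizerSmall(pkg)  # or sbd.StanzaSentencizer(pkg) for stanza-packaged models
hypotheses = apply_packaged_translation(
    pkg, "Hello world. How are you?", translator, sentencizer, num_hypotheses=4
)
print(hypotheses[0].value)
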
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
ctranslate2>=4.0,<5
sentencepiece==0.2.0
spacy
stanza==1.1.1
packaging
sacremoses==0.0.53