Versatile Sentence Boundary Detection #460

Open
wants to merge 2 commits into master
27 changes: 26 additions & 1 deletion argostranslate/networking.py
@@ -3,7 +3,12 @@
import random
import urllib.request

from spacy.cli import download as spacy_download
from spacy import load as spacy_load
from os import makedirs
from pathlib import Path
from argostranslate.utils import error, info
from argostranslate import settings

USER_AGENT = "ArgosTranslate"

@@ -30,7 +35,7 @@ def get_protocol(url: str) -> str | None:


def get(url: str, retry_count: int = 3) -> bytes | None:
"""Downloads data from a url and returns it
"""Downloads data from an url and returns it

Args:
url: The url to download (http, https)
@@ -76,3 +81,23 @@ def get_from(urls: list[str], retry_count: int = 3) -> bytes | None:
if attempt is not None:
return attempt
return None


def cache_spacy():
""" Downloads spacy multilingual model and saves it the cache directory for further use"""
spacy_cache = Path(settings.cache_dir / "spacy")
makedirs(spacy_cache, exist_ok=True)
info("Looking for cached Spacy xx_sent_ud_sm.")
spacy_model = Path(spacy_cache / "senter" / "model")
if not spacy_model.exists():
try:
info("Downloading Spacy xx_sent_ud_sm.")
spacy_download("xx_sent_ud_sm")
nlp = spacy_load("xx_sent_ud_sm", exclude=["parser"])
nlp.to_disk(spacy_cache)
info("Spacy xx_sent_ud_sm successfully cached.")
return spacy_cache
except Exception as e:
error(str(e))
return None
else:
return spacy_cache
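
A minimal usage sketch of the helper above, assuming the cache layout it creates under settings.cache_dir / "spacy" (the sample text is arbitrary):

import spacy
from argostranslate.networking import cache_spacy

spacy_cache = cache_spacy()  # downloads and caches xx_sent_ud_sm on the first call, reuses the cache afterwards
if spacy_cache is not None:
    nlp = spacy.load(spacy_cache, exclude=["parser"])
    doc = nlp("The model is cached once. Every later call loads it from disk.")
    print([sentence.text for sentence in doc.sents])
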
21 changes: 16 additions & 5 deletions argostranslate/package.py
@@ -59,7 +59,7 @@ class IPackage:


Packages are a zip archive of a directory with metadata.json
in its root the .argosmodel file extension. By default a
in its root and the .argosmodel file extension. By default, an
OpenNMT CTranslate2 directory named model/ is expected in the root directory
along with a sentencepiece model named sentencepiece.model or a bpe.model
for tokenizing and Stanza data for sentence boundary detection.
@@ -197,6 +197,17 @@ def __init__(self, package_path: Path):
metadata = json.load(metadata_file)
self.load_metadata_from_json(metadata)

""" As of spacy multilingual support, the sbd package shall depend on the Argos package's content"""
stanza_package = package_path / "stanza"
spacy_package = package_path / "spacy"

if stanza_package.exists(): # Stanza tokenizer within the package
self.packaged_sbd_path = stanza_package
elif spacy_package.exists(): # Explicit/language-specific spaCy model within the package
self.packaged_sbd_path = spacy_package
else: # None if no sbd package embedded in the argos package (will default to cache)
self.packaged_sbd_path = None

sp_model_path = package_path / "sentencepiece.model"
bpe_model_path = package_path / "bpe.model"

@@ -261,7 +272,7 @@ def __init__(self, metadata):
def download(self) -> Path:
"""Downloads the AvailablePackage and returns its path"""
filename = argospm_package_name(self) + ".argosmodel"

'''
# Install sbd package if needed
if self.type == "translate" and not settings.stanza_available:
if (
@@ -275,7 +286,7 @@ def download(self) -> Path:
for sbd_package in sbd_packages:
download_path = sbd_package.download()
install_from_path(download_path)

'''
filepath = settings.downloads_dir / filename
if not filepath.exists():
data = networking.get_from(self.links)
@@ -351,7 +362,7 @@ def get_available_packages() -> list[AvailablePackage]:
for metadata in index:
package = AvailablePackage(metadata)
packages.append(package)

'''
# If stanza not available filter for sbd available
if not settings.stanza_available:
installed_and_available_packages = packages + get_installed_packages()
@@ -367,7 +378,7 @@ def get_available_packages() -> list[AvailablePackage]:
filter(lambda x: x.from_code in sbd_available_codes, packages)
)
return packages + sbd_packages

'''
return packages
except FileNotFoundError:
update_package_index()
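
The directory names checked in __init__ above ("stanza" and "spacy" under the package root) suggest how a language-specific spaCy model could be embedded in an .argosmodel; a hedged sketch, with the model and directory names purely illustrative:

from pathlib import Path

import spacy
from spacy.cli import download as spacy_download

model_name = "fr_core_news_sm"              # example only; prefer small models, per the sbd.py docstring
package_dir = Path("translate-fr_en-1_9")   # unpacked .argosmodel contents (hypothetical name)

spacy_download(model_name)                  # fetch the model if it is not installed yet
nlp = spacy.load(model_name, exclude=["parser"])
nlp.to_disk(package_dir / "spacy")          # IPackage.__init__ then picks this up as packaged_sbd_path
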
54 changes: 40 additions & 14 deletions argostranslate/sbd.py
@@ -1,44 +1,70 @@
from __future__ import annotations

from difflib import SequenceMatcher
from typing import List, Optional

from typing import List
import stanza
import spacy

from argostranslate import package
from argostranslate import package, settings
from argostranslate.package import Package
from argostranslate.utils import info
from argostranslate.networking import cache_spacy


class ISentenceBoundaryDetectionModel:
class ISentenceBoundaryDetectionModel():
# https://github.com/argosopentech/sbd/blob/main/main.py
def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
pkg: Package

def split_sentences(self, text: str) -> List[str]:
raise NotImplementedError


# Spacy sentence boundary detection Sentencizer
# https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3
# https://spacy.io/usage/linguistic-features/#sbd

# Download model:
# python -m spacy download xx_sent_ud_sm
class SpacySentencizerSmall(ISentenceBoundaryDetectionModel):
def __init__(self):
try:
self.nlp = spacy.load("xx_sent_ud_sm", exclude=["parser"])
except OSError:
# Automatically download the model if it doesn't exist
spacy.cli.download("xx_sent_ud_sm")
self.nlp = spacy.load("xx_sent_ud_sm", exclude=["parser"])
def __init__(self, pkg: Package):
'''
Packaging a language-specific spaCy model when "xx_sent_ud_sm" does not cover the language improves performance over Stanza.
Please use small models ("*_core_web_sm") for consistency.
'''
if pkg.packaged_sbd_path is not None:
self.nlp = spacy.load(pkg.packaged_sbd_path, exclude=["parser"])
# If no sbd model is packaged, fall back to the cached spaCy multilingual model (xx_sent_ud_sm)
else:
cached_spacy = cache_spacy()
self.nlp = spacy.load(cached_spacy, exclude=["parser"])
self.nlp.add_pipe("sentencizer")

def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
def split_sentences(self, text: str) -> List[str]:
doc = self.nlp(text)
return [sent.text for sent in doc.sents]

def __str__(self):
return "Spacy xx_sent_ud_sm"
return "Using Spacy model."

# Stanza sentence boundary detection Sentencizer (legacy, but quite a few languages need it)
class StanzaSentencizer(ISentenceBoundaryDetectionModel):
# Initializes the Stanza pipeline, formerly coded in translate.py (commented lines 438-477),
# which is actually a tokenizer, hence the slowness when running it
def __init__(self, pkg: Package):
self.stanza_pipeline = stanza.Pipeline(
lang=pkg.from_code,
dir=str(pkg.packaged_sbd_path),
processors="tokenize",
use_gpu=settings.device == "cuda",
logging_level="WARNING",
)

def split_sentences(self, text: str) -> List[str]:
doc = self.stanza_pipeline(text)
return [sent.text for sent in doc.sentences]

def __str__(self):
return "Using Stanza library"

# Few Shot Sentence Boundary Detection

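
A minimal usage sketch of the two sentencizers above, mirroring the dispatch on packaged_sbd_path that translate.py performs (an installed translate package is assumed):

from argostranslate import package
from argostranslate.sbd import SpacySentencizerSmall, StanzaSentencizer

pkg = package.get_installed_packages()[0]  # assumes at least one translate package is installed
if pkg.packaged_sbd_path is not None and "stanza" in str(pkg.packaged_sbd_path):
    sentencizer = StanzaSentencizer(pkg)
else:
    sentencizer = SpacySentencizerSmall(pkg)  # packaged spaCy model, or the cached xx_sent_ud_sm
print(sentencizer.split_sentences("Dr. Smith arrived late. The meeting had already started."))
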
5 changes: 3 additions & 2 deletions argostranslate/settings.py
@@ -159,9 +159,10 @@ def set_setting(key: str, value):
local_package_index = data_dir / "index.json"

experimental_enabled = os.getenv("ARGOS_EXPERIMENTAL_ENABLED") in TRUE_VALUES

'''
# Legacy environment variable for sentence boundary detection
stanza_available = os.getenv("ARGOS_STANZA_AVAILABLE") in (TRUE_VALUES + [None])

'''
# Supported values: "cpu" and "cuda"
device = get_setting("ARGOS_DEVICE_TYPE", "cpu")

6 changes: 5 additions & 1 deletion argostranslate/tokenizer.py
@@ -27,9 +27,12 @@ def encode(self, sentence: str) -> List[str]:
return tokens

def decode(self, tokens: List[str]) -> str:
"""
# Left byte-fallback tokens undecoded, which was quite detrimental to translations into Asian languages
detokenized = "".join(tokens)
return detokenized.replace("▁", " ")

"""
return self.lazy_processor().decode_pieces(tokens).replace("_", " ")

class BPETokenizer(Tokenizer):
def __init__(self, model_file: Path, from_code: str, to_code: str):
@@ -39,6 +42,7 @@ def __init__(self, model_file: Path, from_code: str, to_code: str):
self.tokenizer = None
self.detokenizer = None
self.bpe_source = None
self.normalizer = None

def lazy_load(self):
if self.tokenizer is None:
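
To illustrate why decode_pieces is preferable to joining the raw pieces: with a byte-fallback SentencePiece model, rare characters are emitted as byte pieces that only the processor can reassemble. A hedged sketch (model path and sample text are placeholders):

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="sentencepiece.model")
pieces = sp.encode("こんにちは世界", out_type=str)  # may contain byte-fallback pieces such as "<0xE4>"
naive = "".join(pieces).replace("▁", " ")            # leaves the "<0x..>" fragments in the output
proper = sp.decode_pieces(pieces)                    # reassembles byte-fallback pieces into real characters
print(naive, proper, sep="\n")
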
41 changes: 24 additions & 17 deletions argostranslate/translate.py
@@ -3,13 +3,13 @@
from typing import List

import ctranslate2
import sentencepiece as spm
# import sentencepiece as spm
from ctranslate2 import Translator

from argostranslate import apis, fewshot, package, sbd, settings
from argostranslate.models import ILanguageModel
from argostranslate.package import Package
from argostranslate.sbd import SpacySentencizerSmall
from argostranslate.sbd import SpacySentencizerSmall, StanzaSentencizer
from argostranslate.utils import info


@@ -67,7 +67,7 @@ def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:

Args:
input_text: The text to be translated.
num_hypotheses: Number of hypothetic results expected
num_hypotheses: Number of hypothetical results expected

Returns:
List of translation hypotheses
@@ -113,7 +113,7 @@ class Language:

Attributes:
code: The code representing the language.
name: The human readable name of the language.
name: The human-readable name of the language.
translations_from: A list of the translations
that translate from this language.
translations_to: A list of the translations
@@ -160,7 +160,14 @@ def __init__(self, from_lang: Language, to_lang: Language, pkg: Package):
self.to_lang = to_lang
self.pkg = pkg
self.translator = None
self.sentencizer = SpacySentencizerSmall()
if 'stanza' in str(pkg.packaged_sbd_path):
self.sentencizer = StanzaSentencizer(pkg)
elif pkg.packaged_sbd_path is None or 'spacy' in str(pkg.packaged_sbd_path):
self.sentencizer = SpacySentencizerSmall(pkg)
else:
# Any other SBD dependency should be defined as a class in the SBD module.
raise NotImplementedError()


def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:
if self.translator is None:
@@ -180,8 +187,8 @@ def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:
self.pkg,
paragraph,
self.translator,
num_hypotheses,
self.sentencizer,
num_hypotheses,
)
)
info("translated_paragraphs:", translated_paragraphs)
@@ -297,7 +304,7 @@ def hypotheses(self, input_text: str, num_hypotheses: int = 4) -> list[Hypothesis]:
translated_paragraphs = []
for paragraph in paragraphs:
translated_paragraph = self.cache.get(paragraph)
# If len() of our cached items are different than `num_hypotheses` it means that
# If len() of our cached items are different from `num_hypotheses` it means that
# the search parameter is changed by caller, so we can't re-use cache, and should update it.
if (
translated_paragraph is None
@@ -407,27 +414,29 @@ def apply_packaged_translation(
pkg: Package,
input_text: str,
translator: Translator,
sentencizer: sbd.ISentenceBoundaryDetectionModel,
num_hypotheses: int = 4,
sentencizer: sbd.ISentenceBoundaryDetectionModel = SpacySentencizerSmall(),
) -> list[Hypothesis]:
"""Applies the translation in pkg to translate input_text.

Args:
pkg: The package that provides the translation.
input_text: The text to be translated.
translator: The CTranslate2 Translator
sentencizer: The sentence boundary detection model
num_hypotheses: The number of hypotheses to generate

Returns:
A list of Hypothesis's for translating input_text

A list of Hypothesis objects for the translated input_text.
"""

info("apply_packaged_translation", input_text)

# Sentence boundary detection
# Sentence boundary detection
sentences = sentencizer.split_sentences(input_text)
info("sentences", sentences)
"""
# Argos Translate 1.9 Sentence Boundary Detection
# Argos Translate 1.9 Sentence Boundary Detection (legacy)
if pkg.type == "sbd":
sentences = [input_text]
elif settings.stanza_available:
@@ -466,9 +475,6 @@ def apply_packaged_translation(
info(input_text[start_index:sbd_index])
start_index = sbd_index
"""
sentences = sentencizer.split_sentences(input_text)

info("sentences", sentences)

# Tokenization
tokenized = [pkg.tokenizer.encode(sentence) for sentence in sentences]
@@ -539,7 +545,8 @@ def get_installed_languages() -> list[Language]:

if settings.model_provider == settings.ModelProvider.OPENNMT:
packages = package.get_installed_packages()

'''
# Legacy sbd package search (environment-dependent)
# If stanza not available filter for sbd available
if not settings.stanza_available:
sbd_packages = list(filter(lambda x: x.type == "sbd", packages))
Expand All @@ -549,7 +556,7 @@ def get_installed_languages() -> list[Language]:
packages = list(
filter(lambda x: x.from_code in sbd_available_codes, packages)
)

'''
# Filter for translate packages
packages = list(filter(lambda x: x.type == "translate", packages))

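
Since sentencizer now precedes num_hypotheses and has no default, callers of apply_packaged_translation must pass the sentencizer built in the translation's __init__; a hedged call sketch (the installed package, its model/ directory, and the sample text are assumptions):

import ctranslate2
from argostranslate import package, sbd
from argostranslate.translate import apply_packaged_translation

pkg = package.get_installed_packages()[0]  # assumes an installed translate package
translator = ctranslate2.Translator(str(pkg.package_path / "model"), device="cpu")
sentencizer = sbd.SpacySentencizerSmall(pkg)  # or sbd.StanzaSentencizer(pkg) for stanza-packaged models
hypotheses = apply_packaged_translation(
    pkg, "Hello world. How are you?", translator, sentencizer, num_hypotheses=4
)
print(hypotheses[0].value)
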
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
ctranslate2>=4.0,<5
sentencepiece==0.2.0
spacy
stanza==1.1.1
packaging
sacremoses==0.0.53