Add MT tutorial #129

Merged
merged 3 commits on Oct 17, 2024
Changes from 2 commits
17 changes: 4 additions & 13 deletions .vscode/settings.json
@@ -1,24 +1,15 @@
{
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.organizeImports": "explicit"
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": [
"tests"
],
"python.analysis.extraPaths": ["tests"],
"python.analysis.importFormat": "relative",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"black-formatter.path": [
"poetry",
"run",
"black"
],
"python.analysis.extraPaths": [
"./tests"
]
}
"black-formatter.path": ["poetry", "run", "black"]
}
3 changes: 2 additions & 1 deletion README.md
@@ -17,4 +17,5 @@ If you would like to find out more about how to use Machine, check out the tutor
- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)
- [Machine Translation](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/machine_translation.ipynb)
- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)
14 changes: 13 additions & 1 deletion machine/jobs/huggingface/hugging_face_nmt_model_factory.py
Expand Up @@ -3,6 +3,8 @@
from pathlib import Path
from typing import Any, cast

import datasets.utils.logging as datasets_logging
import transformers.utils.logging as transformers_logging
from transformers import AutoConfig, AutoModelForSeq2SeqLM, HfArgumentParser, PreTrainedModel, Seq2SeqTrainingArguments
from transformers.integrations import ClearMLCallback
from transformers.tokenization_utils import TruncationStrategy
@@ -39,6 +41,16 @@ def __init__(self, config: Any) -> None:
):
self._training_args.report_to.remove("clearml")

# The default of training_args.log_level is passive, so we set the log level to info here to match that default.
transformers_logging.set_verbosity_info()

log_level = self._training_args.get_process_log_level()
logger.setLevel(log_level)
datasets_logging.set_verbosity(log_level)
transformers_logging.set_verbosity(log_level)
transformers_logging.enable_default_handler()
transformers_logging.enable_explicit_format()

@property
def train_tokenizer(self) -> bool:
return False
@@ -67,7 +79,7 @@ def create_model_trainer(self, corpus: ParallelTextCorpus) -> Trainer:
src_lang=self._config.src_lang,
tgt_lang=self._config.trg_lang,
add_unk_src_tokens=self._config.huggingface.tokenizer.add_unk_src_tokens,
add_unk_trg_tokens=self._config.huggingface.tokenizer.add_unk_trg_tokens,
add_unk_tgt_tokens=self._config.huggingface.tokenizer.add_unk_tgt_tokens,
)

def create_engine(self) -> TranslationEngine:
2 changes: 1 addition & 1 deletion machine/jobs/settings.yaml
@@ -24,7 +24,7 @@ default:
oom_batch_size_backoff_mult: 0.5
tokenizer:
add_unk_src_tokens: true
add_unk_trg_tokens: true
add_unk_tgt_tokens: true
thot_mt:
word_alignment_model_type: hmm
tokenizer: latin
2 changes: 1 addition & 1 deletion machine/translation/huggingface/hugging_face_nmt_engine.py
@@ -52,7 +52,7 @@ def __init__(
self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
else:
self._mpn = None

47 changes: 17 additions & 30 deletions machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -6,9 +6,7 @@
from pathlib import Path
from typing import Any, Callable, List, Optional, Union, cast

import datasets.utils.logging as datasets_logging
import torch # pyright: ignore[reportMissingImports]
import transformers.utils.logging as transformers_logging
from datasets.arrow_dataset import Dataset
from sacremoses import MosesPunctNormalizer
from torch import Tensor # pyright: ignore[reportMissingImports]
@@ -84,10 +82,10 @@ def __init__(
corpus: Union[ParallelTextCorpus, Dataset],
src_lang: Optional[str] = None,
tgt_lang: Optional[str] = None,
max_source_length: Optional[int] = None,
max_target_length: Optional[int] = None,
max_src_length: Optional[int] = None,
max_tgt_length: Optional[int] = None,
add_unk_src_tokens: bool = False,
add_unk_trg_tokens: bool = True,
add_unk_tgt_tokens: bool = True,
) -> None:
self._model = model
self._training_args = training_args
@@ -96,12 +94,12 @@ def __init__(
self._tgt_lang = tgt_lang
self._trainer: Optional[Seq2SeqTrainer] = None
self._metrics = {}
self.max_source_length = max_source_length
self.max_target_length = max_target_length
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self._add_unk_src_tokens = add_unk_src_tokens
self._add_unk_trg_tokens = add_unk_trg_tokens
self._add_unk_tgt_tokens = add_unk_tgt_tokens
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
self._stats = TrainStats()

@property
Expand All @@ -113,17 +111,6 @@ def train(
progress: Optional[Callable[[ProgressStatus], None]] = None,
check_canceled: Optional[Callable[[], None]] = None,
) -> None:
if self._training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers_logging.set_verbosity_info()

log_level = self._training_args.get_process_log_level()
logger.setLevel(log_level)
datasets_logging.set_verbosity(log_level)
transformers_logging.set_verbosity(log_level)
transformers_logging.enable_default_handler()
transformers_logging.enable_explicit_format()

last_checkpoint = None
if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
@@ -203,7 +190,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
logger.info(f"Added {len(missing_tokens)} tokens to the tokenizer: {missing_tokens}")
return AutoTokenizer.from_pretrained(str(tokenizer_dir), use_fast=True)

if self._add_unk_src_tokens or self._add_unk_trg_tokens:
if self._add_unk_src_tokens or self._add_unk_tgt_tokens:
logger.info("Checking for missing tokens")
if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
@@ -217,7 +204,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
)
# using unofficially supported behavior to set the normalizer
tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer # type: ignore
if self._add_unk_src_tokens and self._add_unk_trg_tokens:
if self._add_unk_src_tokens and self._add_unk_tgt_tokens:
lang_codes = [src_lang, tgt_lang]
elif self._add_unk_src_tokens:
lang_codes = [src_lang]
@@ -293,12 +280,12 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
if model.name_or_path.startswith("t5-") or model.name_or_path.startswith("google/mt5-"):
prefix = f"translate {self._src_lang} to {self._tgt_lang}: "

max_source_length = self.max_source_length
if max_source_length is None:
max_source_length = model.config.max_length
max_target_length = self.max_target_length
if max_target_length is None:
max_target_length = model.config.max_length
max_src_length = self.max_src_length
if max_src_length is None:
max_src_length = model.config.max_length
max_tgt_length = self.max_tgt_length
if max_tgt_length is None:
max_tgt_length = model.config.max_length

if self._training_args.label_smoothing_factor > 0 and not hasattr(
model, "prepare_decoder_input_ids_from_labels"
@@ -317,9 +304,9 @@ def preprocess_function(examples):
inputs = [prefix + ex[src_lang] for ex in examples["translation"]]
targets = [ex[tgt_lang] for ex in examples["translation"]]

model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True)
# Tokenize targets with the `text_target` keyword argument
labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
labels = tokenizer(text_target=targets, max_length=max_tgt_length, truncation=True)

model_inputs["labels"] = labels["input_ids"]
return model_inputs
14 changes: 13 additions & 1 deletion machine/translation/translation_suggester.py
@@ -1,8 +1,10 @@
from abc import ABC, abstractmethod
from typing import Iterable, Sequence
from typing import Iterable, Optional, Sequence

from .interactive_translator import InteractiveTranslator
from .translation_result import TranslationResult
from .translation_suggestion import TranslationSuggestion
from .truecaser import Truecaser


class TranslationSuggester(ABC):
@@ -14,3 +16,13 @@ def __init__(self, confidence_threshold: float = 0, break_on_punctuation: bool =
def get_suggestions(
self, n: int, prefix_count: int, is_last_word_complete: bool, results: Iterable[TranslationResult]
) -> Sequence[TranslationSuggestion]: ...

def get_suggestions_from_translator(
self, n: int, translator: InteractiveTranslator, truecaser: Optional[Truecaser] = None
) -> Sequence[TranslationSuggestion]:
results = translator.get_current_results()
if truecaser is not None:
results = (
truecaser.truecase_translation_result(result, translator.target_detokenizer) for result in results
)
return self.get_suggestions(n, len(translator.prefix_word_ranges), translator.is_last_word_complete, results)
21 changes: 20 additions & 1 deletion machine/translation/truecaser.py
@@ -1,8 +1,11 @@
from abc import ABC, abstractmethod
from typing import Sequence
from typing import Optional, Sequence

from ..corpora.text_corpus import TextCorpus
from ..tokenization.detokenizer import Detokenizer
from ..tokenization.whitespace_detokenizer import WHITESPACE_DETOKENIZER
from .trainer import Trainer
from .translation_result import TranslationResult


class Truecaser(ABC):
@@ -15,5 +18,21 @@ def train_segment(self, segment: Sequence[str], sentence_start: bool = True) ->
@abstractmethod
def truecase(self, segment: Sequence[str]) -> Sequence[str]: ...

def truecase_translation_result(
self, result: TranslationResult, detokenizer: Optional[Detokenizer] = None
) -> TranslationResult:
if detokenizer is None:
detokenizer = WHITESPACE_DETOKENIZER
target_tokens = self.truecase(result.target_tokens)
return TranslationResult(
detokenizer.detokenize(target_tokens),
result.source_tokens,
target_tokens,
result.confidences,
result.sources,
result.alignment,
result.phrases,
)

@abstractmethod
def save(self) -> None: ...
37 changes: 36 additions & 1 deletion samples/corpora.ipynb
@@ -328,7 +328,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -371,6 +371,41 @@
" print(f\"{row.ref}: {row.text}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can extract non-Scripture portions of the project as well, such as introductory material, footnotes, section headers, etc. This feature is enabled by setting the `include_all_text` flag. Machine uses a special Scripture reference for uniquely indentifying all text segments in a Scripture book. Each text segment is referenced by its position relative to a verse and its marker."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1JN 1:0/1:ide: UTF-8\n",
"1JN 1:0/2:h: 1 John\n",
"1JN 1:0/3:toc1: John’s First Letter\n",
"1JN 1:0/4:toc2: 1 John\n",
"1JN 1:0/5:toc3: 1 John\n",
"1JN 1:0/6:mt1: John’s First Letter\n",
"1JN 1:1: That which was from the beginning, that which we have heard, that which we have seen with our eyes, that which we saw, and our hands touched, concerning the Word of life\n",
"1JN 1:2: (and the life was revealed, and we have seen, and testify, and declare to you the life, the eternal life, which was with the Father, and was revealed to us);\n",
"1JN 1:3: that which we have seen and heard we declare to you, that you also may have fellowship with us. Yes, and our fellowship is with the Father and with his Son, Jesus Christ.\n",
"1JN 1:3/1:f: 1:3 “Christ” means “Anointed One”.\n"
]
}
],
"source": [
"corpus = ParatextTextCorpus(\"data/WEB-PT\", include_all_text=True)\n",
"for row in corpus.take(10):\n",
" print(f\"{row.ref}: {row.text}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
29 changes: 29 additions & 0 deletions samples/data/smt.cfg
@@ -0,0 +1,29 @@
# Translation model prefix
-tm tm/src_trg

# Language model
-lm lm/trg.lm

# W parameter (maximum number of translation options to be considered per each source phrase)
-W 10

# S parameter (maximum number of hypotheses that can be stored in each stack)
-S 10

# A parameter (Maximum length in words of the source phrases to be translated)
-A 7

# Degree of non-monotonicity
-nomon 0

# Heuristic function used
-h 6

# Best-first search flag
-be

# Translation model weights
-tmw 0 0.5 1 1 1 1 0 1

# Set online learning parameters (ol_alg, lr_policy, l_stepsize, em_iters, e_par, r_par)
-olp 0 0 1 5 1 0
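
For readers following along, here is a minimal, hypothetical sketch of how a tutorial notebook might load this sample decoder configuration with machine.py's Thot-based SMT engine. The class names, constructor arguments, and `translate` input shown here are assumptions about the library's Thot wrappers, not something defined in this PR; the machine_translation.ipynb tutorial is the authoritative reference.

```python
# Hypothetical usage sketch (not part of this PR): loading the sample decoder
# config with machine.py's Thot SMT wrapper. Exact signatures are assumptions;
# see samples/machine_translation.ipynb for the real API.
from machine.translation.thot import ThotSmtModel, ThotWordAlignmentModelType

model = ThotSmtModel(ThotWordAlignmentModelType.HMM, "data/smt.cfg")
result = model.translate("una frase de ejemplo")  # assumed to accept a raw source sentence
print(result.translation)
```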