From d13174b811c9ff8295809de7870c9a12edb66463 Mon Sep 17 00:00:00 2001 From: paul Date: Sun, 22 Oct 2023 16:08:08 +0100 Subject: [PATCH 1/7] Remove 'tokenize' argument from initialize_encoder function --- laser_encoders/download_models.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py index 452501d3..f4920b93 100644 --- a/laser_encoders/download_models.py +++ b/laser_encoders/download_models.py @@ -126,7 +126,6 @@ def initialize_encoder( model_dir: str = None, spm: bool = True, laser: str = None, - tokenize: bool = False, ): downloader = LaserModelDownloader(model_dir) if laser is not None: @@ -157,17 +156,13 @@ def initialize_encoder( model_dir = downloader.model_dir model_path = os.path.join(model_dir, f"{file_path}.pt") spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab") - spm_model = None + if not os.path.exists(spm_vocab): # if there is no cvocab for the laser3 lang use laser2 cvocab spm_vocab = os.path.join(model_dir, "laser2.cvocab") - if tokenize: - spm_model = os.path.join(model_dir, f"{file_path}.spm") - if not os.path.exists(spm_model): - spm_model = os.path.join(model_dir, "laser2.spm") - + return SentenceEncoder( - model_path=model_path, spm_vocab=spm_vocab, spm_model=spm_model + model_path=model_path, spm_vocab=spm_vocab, spm_model=None ) From 9b6f9cd48ce67874b3b46c5abd582f0b98fb2c90 Mon Sep 17 00:00:00 2001 From: paul Date: Sun, 22 Oct 2023 16:33:42 +0100 Subject: [PATCH 2/7] Add LaserEncoderPipeline for streamlined tokenization and encoding --- laser_encoders/download_models.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py index f4920b93..0a19bca2 100644 --- a/laser_encoders/download_models.py +++ b/laser_encoders/download_models.py @@ -198,6 +198,25 @@ def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = N return LaserTokenizer(spm_model=Path(model_path)) +class LaserEncoderPipeline: + def __init__(self, lang: str, model_dir: str = None, spm: bool = True, laser: str = None): + + self.tokenizer = initialize_tokenizer(lang=lang, model_dir=model_dir, laser=laser) + self.encoder = initialize_encoder(lang=lang, model_dir=model_dir, spm=spm,laser=laser) + + def encode_sentences(self, sentences: list) -> list: + """ + Tokenizes and encodes a list of sentences. + + Args: + - sentences (list of str): List of sentences to tokenize and encode. + + Returns: + - List of embeddings for each sentence. + """ + tokenized_sentences = [self.tokenizer.tokenize(sentence) for sentence in sentences] + return self.encoder.encode_sentences(tokenized_sentences) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="LASER: Download Laser models") parser.add_argument( From 049f2e24793a3561e60b7cefc247b5d665576ff1 Mon Sep 17 00:00:00 2001 From: paul Date: Sun, 22 Oct 2023 16:57:59 +0100 Subject: [PATCH 3/7] docs: Update README to show use of LaserEncoderPipeline --- laser_encoders/README.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/laser_encoders/README.md b/laser_encoders/README.md index a85cdef5..3020ea6b 100644 --- a/laser_encoders/README.md +++ b/laser_encoders/README.md @@ -25,10 +25,21 @@ You can install laser_encoders using pip: ## Usage -Here's a simple example of how you can download and initialise the tokenizer and encoder with just one step. 
+Here's a simple example of how to obtain embeddings for sentences using the `LaserEncoderPipeline`:
 
-**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory` to the initialize_tokenizer and initialize_encoder functions
+>**Note:** By default, the models will be downloaded to the `~/.cache/laser_encoders` directory. To specify a different download location, you can provide the argument `model_dir=path/to/model/directory`.
+
+```py
+from laser_encoders import LaserEncoderPipeline
+
+# Initialize the LASER encoder pipeline
+encoder = LaserEncoderPipeline(lang="igbo")
+
+# Encode sentences into embeddings
+embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])
+```
+
+If you prefer more control over the tokenization and encoding process, you can initialize the tokenizer and encoder separately:
 ```py
 from laser_encoders import initialize_encoder, initialize_tokenizer
 
@@ -39,16 +50,10 @@ tokenized_sentence = tokenizer.tokenize("nnọọ, kedu ka ị mere")
 
 # Initialize the LASER sentence encoder
 encoder = initialize_encoder(lang="igbo")
 
-# Encode sentences into embeddings
+# Encode tokenized sentences into embeddings
 embeddings = encoder.encode_sentences([tokenized_sentence])
 ```
-
-When initializing the encoder, you have the option to enable both tokenization and encoding by setting the `tokenize` flag to `True`. Below is an example of how to use it:
-```py
-encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
-embeddings = encoder("nnọọ, kedu ka ị mere")
-```
->setting the `spm` flag to `True` tells the encoder to also download the accompanying spm model
+>By default, the `spm` flag is set to `True` when initializing the encoder, ensuring the accompanying spm model is downloaded.
 
 **Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").
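The pipeline introduced in this series also accepts the optional arguments mentioned in the README note above. A minimal sketch, assuming the `LaserEncoderPipeline` constructor signature added in PATCH 2/7 (`lang`, `model_dir`, `spm`, `laser`); the directory path is purely illustrative:

```py
from laser_encoders import LaserEncoderPipeline

# Cache models in a custom directory (illustrative path; defaults to
# ~/.cache/laser_encoders) and pin the encoder family explicitly.
encoder = LaserEncoderPipeline(
    lang="igbo",
    model_dir="path/to/model/directory",
    spm=True,        # also download the accompanying spm model
    laser="laser3",  # or "laser2"; omit to infer from the language
)
embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])
```

Omitting `laser` lets `initialize_encoder` choose laser3 or laser2 from the language code, falling back to the laser2 cvocab when a laser3 language ships without its own.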
From 67ba8bb3304d797d6e4d36bc36be7e2a6f379d6f Mon Sep 17 00:00:00 2001 From: paul Date: Sun, 22 Oct 2023 17:04:58 +0100 Subject: [PATCH 4/7] style: Reformat code using black --- laser_encoders/__init__.py | 6 +++++- laser_encoders/download_models.py | 26 ++++++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/laser_encoders/__init__.py b/laser_encoders/__init__.py index 75264c55..bd01969b 100644 --- a/laser_encoders/__init__.py +++ b/laser_encoders/__init__.py @@ -12,4 +12,8 @@ # # ------------------------------------------------------- -from laser_encoders.download_models import initialize_encoder, initialize_tokenizer +from laser_encoders.download_models import ( + LaserEncoderPipeline, + initialize_encoder, + initialize_tokenizer, +) diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py index 0a19bca2..0f585f2f 100644 --- a/laser_encoders/download_models.py +++ b/laser_encoders/download_models.py @@ -160,10 +160,8 @@ def initialize_encoder( if not os.path.exists(spm_vocab): # if there is no cvocab for the laser3 lang use laser2 cvocab spm_vocab = os.path.join(model_dir, "laser2.cvocab") - - return SentenceEncoder( - model_path=model_path, spm_vocab=spm_vocab, spm_model=None - ) + + return SentenceEncoder(model_path=model_path, spm_vocab=spm_vocab, spm_model=None) def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None): @@ -199,24 +197,32 @@ def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = N class LaserEncoderPipeline: - def __init__(self, lang: str, model_dir: str = None, spm: bool = True, laser: str = None): + def __init__( + self, lang: str, model_dir: str = None, spm: bool = True, laser: str = None + ): + self.tokenizer = initialize_tokenizer( + lang=lang, model_dir=model_dir, laser=laser + ) + self.encoder = initialize_encoder( + lang=lang, model_dir=model_dir, spm=spm, laser=laser + ) - self.tokenizer = initialize_tokenizer(lang=lang, model_dir=model_dir, laser=laser) - self.encoder = initialize_encoder(lang=lang, model_dir=model_dir, spm=spm,laser=laser) - def encode_sentences(self, sentences: list) -> list: """ Tokenizes and encodes a list of sentences. - + Args: - sentences (list of str): List of sentences to tokenize and encode. Returns: - List of embeddings for each sentence. 
""" - tokenized_sentences = [self.tokenizer.tokenize(sentence) for sentence in sentences] + tokenized_sentences = [ + self.tokenizer.tokenize(sentence) for sentence in sentences + ] return self.encoder.encode_sentences(tokenized_sentences) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="LASER: Download Laser models") parser.add_argument( From a8efad2e07de41ce8f7851166ca47abf4f4d699c Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 24 Oct 2023 15:08:13 +0100 Subject: [PATCH 5/7] refactor: move encoder and tokenizer initialization into repective files --- laser_encoders/__init__.py | 7 +- laser_encoders/download_models.py | 104 ------------------------------ laser_encoders/laser_tokenizer.py | 36 +++++++++++ laser_encoders/models.py | 74 ++++++++++++++++++++- 4 files changed, 111 insertions(+), 110 deletions(-) diff --git a/laser_encoders/__init__.py b/laser_encoders/__init__.py index bd01969b..05b46186 100644 --- a/laser_encoders/__init__.py +++ b/laser_encoders/__init__.py @@ -12,8 +12,5 @@ # # ------------------------------------------------------- -from laser_encoders.download_models import ( - LaserEncoderPipeline, - initialize_encoder, - initialize_tokenizer, -) +from laser_encoders.laser_tokenizer import initialize_tokenizer +from laser_encoders.models import LaserEncoderPipeline, initialize_encoder diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py index 0f585f2f..1167d7c1 100644 --- a/laser_encoders/download_models.py +++ b/laser_encoders/download_models.py @@ -26,8 +26,6 @@ from tqdm import tqdm from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE -from laser_encoders.laser_tokenizer import LaserTokenizer -from laser_encoders.models import SentenceEncoder logging.basicConfig( stream=sys.stdout, @@ -121,108 +119,6 @@ def main(self, args): ) -def initialize_encoder( - lang: str = None, - model_dir: str = None, - spm: bool = True, - laser: str = None, -): - downloader = LaserModelDownloader(model_dir) - if laser is not None: - if laser == "laser3": - lang = downloader.get_language_code(LASER3_LANGUAGE, lang) - downloader.download_laser3(lang=lang, spm=spm) - file_path = f"laser3-{lang}.v1" - elif laser == "laser2": - downloader.download_laser2() - file_path = "laser2" - else: - raise ValueError( - f"Unsupported laser model: {laser}. Choose either laser2 or laser3." - ) - else: - lang = downloader.get_language_code(LASER3_LANGUAGE, lang) - if lang in LASER3_LANGUAGE: - downloader.download_laser3(lang=lang, spm=spm) - file_path = f"laser3-{lang}.v1" - elif lang in LASER2_LANGUAGE: - downloader.download_laser2() - file_path = "laser2" - else: - raise ValueError( - f"Unsupported language name: {lang}. Please specify a supported language name." 
- ) - - model_dir = downloader.model_dir - model_path = os.path.join(model_dir, f"{file_path}.pt") - spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab") - - if not os.path.exists(spm_vocab): - # if there is no cvocab for the laser3 lang use laser2 cvocab - spm_vocab = os.path.join(model_dir, "laser2.cvocab") - - return SentenceEncoder(model_path=model_path, spm_vocab=spm_vocab, spm_model=None) - - -def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None): - downloader = LaserModelDownloader(model_dir) - if laser is not None: - if laser == "laser3": - lang = downloader.get_language_code(LASER3_LANGUAGE, lang) - if lang in SPM_LANGUAGE: - filename = f"laser3-{lang}.v1.spm" - else: - filename = "laser2.spm" - elif laser == "laser2": - filename = "laser2.spm" - else: - raise ValueError( - f"Unsupported laser model: {laser}. Choose either laser2 or laser3." - ) - else: - if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE: - lang = downloader.get_language_code(LASER3_LANGUAGE, lang) - if lang in SPM_LANGUAGE: - filename = f"laser3-{lang}.v1.spm" - else: - filename = "laser2.spm" - else: - raise ValueError( - f"Unsupported language name: {lang}. Please specify a supported language name." - ) - - downloader.download(filename) - model_path = os.path.join(downloader.model_dir, filename) - return LaserTokenizer(spm_model=Path(model_path)) - - -class LaserEncoderPipeline: - def __init__( - self, lang: str, model_dir: str = None, spm: bool = True, laser: str = None - ): - self.tokenizer = initialize_tokenizer( - lang=lang, model_dir=model_dir, laser=laser - ) - self.encoder = initialize_encoder( - lang=lang, model_dir=model_dir, spm=spm, laser=laser - ) - - def encode_sentences(self, sentences: list) -> list: - """ - Tokenizes and encodes a list of sentences. - - Args: - - sentences (list of str): List of sentences to tokenize and encode. - - Returns: - - List of embeddings for each sentence. - """ - tokenized_sentences = [ - self.tokenizer.tokenize(sentence) for sentence in sentences - ] - return self.encoder.encode_sentences(tokenized_sentences) - - if __name__ == "__main__": parser = argparse.ArgumentParser(description="LASER: Download Laser models") parser.add_argument( diff --git a/laser_encoders/laser_tokenizer.py b/laser_encoders/laser_tokenizer.py index c180844b..0488cb2c 100644 --- a/laser_encoders/laser_tokenizer.py +++ b/laser_encoders/laser_tokenizer.py @@ -16,6 +16,7 @@ import gzip import logging +import os import re import sys from pathlib import Path @@ -24,6 +25,9 @@ import sentencepiece as spm from sacremoses import MosesDetokenizer, MosesPunctNormalizer +from laser_encoders.download_models import LaserModelDownloader +from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE, SPM_LANGUAGE + SPACE_NORMALIZER = re.compile(r"\s+") logging.basicConfig( @@ -131,3 +135,35 @@ def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: ids.extend(token_ids) return ids + + +def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None): + downloader = LaserModelDownloader(model_dir) + if laser is not None: + if laser == "laser3": + lang = downloader.get_language_code(LASER3_LANGUAGE, lang) + if lang in SPM_LANGUAGE: + filename = f"laser3-{lang}.v1.spm" + else: + filename = "laser2.spm" + elif laser == "laser2": + filename = "laser2.spm" + else: + raise ValueError( + f"Unsupported laser model: {laser}. Choose either laser2 or laser3." 
+ ) + else: + if lang in LASER3_LANGUAGE or lang in LASER2_LANGUAGE: + lang = downloader.get_language_code(LASER3_LANGUAGE, lang) + if lang in SPM_LANGUAGE: + filename = f"laser3-{lang}.v1.spm" + else: + filename = "laser2.spm" + else: + raise ValueError( + f"Unsupported language name: {lang}. Please specify a supported language name." + ) + + downloader.download(filename) + model_path = os.path.join(downloader.model_dir, filename) + return LaserTokenizer(spm_model=Path(model_path)) diff --git a/laser_encoders/models.py b/laser_encoders/models.py index e2a81ef9..69b6633c 100644 --- a/laser_encoders/models.py +++ b/laser_encoders/models.py @@ -14,6 +14,7 @@ import logging +import os import re import sys from collections import namedtuple @@ -26,7 +27,9 @@ from fairseq.models.transformer import Embedding, TransformerEncoder from fairseq.modules import LayerNorm -from laser_encoders.laser_tokenizer import LaserTokenizer +from laser_encoders.download_models import LaserModelDownloader +from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE +from laser_encoders.laser_tokenizer import LaserTokenizer, initialize_tokenizer SPACE_NORMALIZER = re.compile(r"\s+") Batch = namedtuple("Batch", "srcs tokens lengths") @@ -325,3 +328,72 @@ def combine_bidir(outs): if encoder_padding_mask.any() else None, } + +def initialize_encoder( + lang: str = None, + model_dir: str = None, + spm: bool = True, + laser: str = None, +): + downloader = LaserModelDownloader(model_dir) + if laser is not None: + if laser == "laser3": + lang = downloader.get_language_code(LASER3_LANGUAGE, lang) + downloader.download_laser3(lang=lang, spm=spm) + file_path = f"laser3-{lang}.v1" + elif laser == "laser2": + downloader.download_laser2() + file_path = "laser2" + else: + raise ValueError( + f"Unsupported laser model: {laser}. Choose either laser2 or laser3." + ) + else: + lang = downloader.get_language_code(LASER3_LANGUAGE, lang) + if lang in LASER3_LANGUAGE: + downloader.download_laser3(lang=lang, spm=spm) + file_path = f"laser3-{lang}.v1" + elif lang in LASER2_LANGUAGE: + downloader.download_laser2() + file_path = "laser2" + else: + raise ValueError( + f"Unsupported language name: {lang}. Please specify a supported language name." + ) + + model_dir = downloader.model_dir + model_path = os.path.join(model_dir, f"{file_path}.pt") + spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab") + + if not os.path.exists(spm_vocab): + # if there is no cvocab for the laser3 lang use laser2 cvocab + spm_vocab = os.path.join(model_dir, "laser2.cvocab") + + return SentenceEncoder(model_path=model_path, spm_vocab=spm_vocab, spm_model=None) + + +class LaserEncoderPipeline: + def __init__( + self, lang: str, model_dir: str = None, spm: bool = True, laser: str = None + ): + self.tokenizer = initialize_tokenizer( + lang=lang, model_dir=model_dir, laser=laser + ) + self.encoder = initialize_encoder( + lang=lang, model_dir=model_dir, spm=spm, laser=laser + ) + + def encode_sentences(self, sentences: list) -> list: + """ + Tokenizes and encodes a list of sentences. + + Args: + - sentences (list of str): List of sentences to tokenize and encode. + + Returns: + - List of embeddings for each sentence. 
+ """ + tokenized_sentences = [ + self.tokenizer.tokenize(sentence) for sentence in sentences + ] + return self.encoder.encode_sentences(tokenized_sentences) From d56a3a89f6b7b12bad4311f73d16a58f48f3b55f Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 24 Oct 2023 15:09:46 +0100 Subject: [PATCH 6/7] style: run black --- laser_encoders/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/laser_encoders/models.py b/laser_encoders/models.py index 69b6633c..037a4f9f 100644 --- a/laser_encoders/models.py +++ b/laser_encoders/models.py @@ -329,6 +329,7 @@ def combine_bidir(outs): else None, } + def initialize_encoder( lang: str = None, model_dir: str = None, From 3fc5ea22dc30f3151c7578fabb5bc6a2ec908c13 Mon Sep 17 00:00:00 2001 From: paul Date: Thu, 26 Oct 2023 21:11:37 +0100 Subject: [PATCH 7/7] test: Add test for LaserEncoderPipeline --- laser_encoders/test_laser_tokenizer.py | 60 +++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/laser_encoders/test_laser_tokenizer.py b/laser_encoders/test_laser_tokenizer.py index 867111cf..1155f8d2 100644 --- a/laser_encoders/test_laser_tokenizer.py +++ b/laser_encoders/test_laser_tokenizer.py @@ -21,7 +21,11 @@ import numpy as np import pytest -from laser_encoders import initialize_encoder, initialize_tokenizer +from laser_encoders import ( + LaserEncoderPipeline, + initialize_encoder, + initialize_tokenizer, +) @pytest.fixture @@ -35,6 +39,27 @@ def input_text() -> str: return "This is a test sentence." +@pytest.fixture +def test_readme_params() -> dict: + return { + "lang": "igbo", + "input_sentences": ["nnọọ, kedu ka ị mere"], + "expected_embedding_shape": (1, 1024), + "expected_array": [ + 0.3807628, + -0.27941525, + -0.17819545, + 0.44144684, + -0.38985375, + 0.04719935, + 0.20238206, + -0.03934783, + 0.0118901, + 0.28986093, + ], + } + + def test_tokenize(tokenizer, input_text: str): expected_output = "▁this ▁is ▁a ▁test ▁sent ence ." 
assert tokenizer.tokenize(input_text) == expected_output @@ -175,3 +200,36 @@ def test_sentence_encoder( assert isinstance(sentence_embedding, np.ndarray) assert sentence_embedding.shape == (1, 1024) assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3) + + +def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict): + lang = test_readme_params["lang"] + input_sentences = test_readme_params["input_sentences"] + expected_embedding_shape = test_readme_params["expected_embedding_shape"] + expected_array = test_readme_params["expected_array"] + + encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang) + embeddings = encoder.encode_sentences(input_sentences) + + assert isinstance(embeddings, np.ndarray) + assert embeddings.shape == expected_embedding_shape + assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3) + + +def test_separate_initialization_and_encoding( + tmp_path, tokenizer, test_readme_params: dict +): + lang = test_readme_params["lang"] + input_sentences = test_readme_params["input_sentences"] + expected_embedding_shape = test_readme_params["expected_embedding_shape"] + expected_array = test_readme_params["expected_array"] + + tokenized_sentence = tokenizer.tokenize(input_sentences[0]) + sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang) + + # Encode tokenized sentences into embeddings + embeddings = sentence_encoder.encode_sentences([tokenized_sentence]) + + assert isinstance(embeddings, np.ndarray) + assert embeddings.shape == expected_embedding_shape + assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
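For completeness, the round trip these tests pin down can be sketched outside pytest. A minimal sketch: the `(1, 1024)` shape and the reference values below come straight from the `test_readme_params` fixture above, and models are downloaded to the default cache on first use:

```py
import numpy as np

from laser_encoders import LaserEncoderPipeline

# One-step pipeline, as exercised by test_laser_encoder_pipeline.
encoder = LaserEncoderPipeline(lang="igbo")
embeddings = encoder.encode_sentences(["nnọọ, kedu ka ị mere"])

assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == (1, 1024)

# The first ten dimensions should match the fixture's reference values
# within the same tolerance the tests use.
expected = np.array(
    [0.3807628, -0.27941525, -0.17819545, 0.44144684, -0.38985375,
     0.04719935, 0.20238206, -0.03934783, 0.0118901, 0.28986093]
)
assert np.allclose(expected, embeddings[:, :10], atol=1e-3)
```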