facebookresearch · heffernankevin · Sep 22, 2023 · Sep 8, 2023 · Sep 8, 2023 · Sep 18, 2023
diff --git a/laser_encoders/README.md b/laser_encoders/README.md
@@ -43,6 +43,13 @@ encoder = initialize_encoder(lang="igbo")
 embeddings = encoder.encode_sentences([tokenized_sentence])
 ```
 
+When initializing the encoder, you have the option to enable both tokenization and encoding by setting the `tokenize` flag to `True`. Below is an example of how to use it:
+```py
+encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
+embeddings = encoder("nnọọ, kedu ka ị mere")
+```
+> Setting the `spm` flag to `True` tells the encoder to also download the accompanying SPM model.
+
 **Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").
 
 ## Downloading the pre-trained models
@@ -61,13 +68,19 @@ python -m laser_encoders.download_models --model-dir=path/to/model/directory
 
 > For a comprehensive list of available arguments, you can use the `--help` command with the download_models script.
 
-Once you have successfully downloaded the models, you can utilize the `LaserTokenizer` to tokenize text in your desired language. Here's an example of how you can achieve this:
+Once you have successfully downloaded the models, you can utilize the `SentenceEncoder` to tokenize and encode your text in your desired language. Here's an example of how you can achieve this:
 
 ```py
-from laser_encoders.laser_tokenizer import LaserTokenizer
 from laser_encoders.models import SentenceEncoder
 from pathlib import Path
 
+encoder = SentenceEncoder(model_path=path/to/downloaded/model, spm_model=Path(path/to/spm_model), spm_vocab=path/to/cvocab)
+embeddings = encoder("This is a test sentence.")
+```
+If you want to perform tokenization separately, you can do so as shown below:
+```py
+from laser_encoders.laser_tokenizer import LaserTokenizer
+
 tokenizer = LaserTokenizer(spm_model=Path(path/to/spm_model))
 
 tokenized_sentence = tokenizer.tokenize("This is a test sentence.")

diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py
@@ -117,6 +117,7 @@ def initialize_encoder(
     model_dir: str = None,
     spm: bool = True,
     laser: str = None,
+    tokenize: bool = False,
 ):
     downloader = LaserModelDownloader(model_dir)
     if laser is not None:
@@ -146,12 +147,19 @@ def initialize_encoder(
 
     model_dir = downloader.model_dir
     model_path = os.path.join(model_dir, f"{file_path}.pt")
-    spm_path = os.path.join(model_dir, f"{file_path}.cvocab")
-
-    if not os.path.exists(spm_path):
+    spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
+    spm_model = None
+    if not os.path.exists(spm_vocab):
         # if there is no cvocab for the laser3 lang use laser2 cvocab
-        spm_path = os.path.join(model_dir, "laser2.cvocab")
-    return SentenceEncoder(model_path=model_path, spm_vocab=spm_path)
+        spm_vocab = os.path.join(model_dir, "laser2.cvocab")
+    if tokenize:
+        spm_model = os.path.join(model_dir, f"{file_path}.spm")
+        if not os.path.exists(spm_model):
+            spm_model = os.path.join(model_dir, "laser2.spm")
+
+    return SentenceEncoder(
+        model_path=model_path, spm_vocab=spm_vocab, spm_model=spm_model
+    )
 
 
 def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):

diff --git a/laser_encoders/models.py b/laser_encoders/models.py
@@ -17,6 +17,7 @@
 import re
 import sys
 from collections import namedtuple
+from pathlib import Path
 
 import numpy as np
 import torch
@@ -25,6 +26,8 @@
 from fairseq.models.transformer import Embedding, TransformerEncoder
 from fairseq.modules import LayerNorm
 
+from laser_encoders.laser_tokenizer import LaserTokenizer
+
 SPACE_NORMALIZER = re.compile(r"\s+")
 Batch = namedtuple("Batch", "srcs tokens lengths")
 
@@ -43,13 +46,18 @@ def __init__(
         max_sentences=None,
         max_tokens=None,
         spm_vocab=None,
+        spm_model=None,
         cpu=False,
         fp16=False,
         verbose=False,
         sort_kind="quicksort",
     ):
         if verbose:
             logger.info(f"loading encoder: {model_path}")
+        self.spm_model = spm_model
+        if self.spm_model:
+            self.tokenizer = LaserTokenizer(spm_model=Path(self.spm_model))
+
         self.use_cuda = torch.cuda.is_available() and not cpu
         self.max_sentences = max_sentences
         self.max_tokens = max_tokens
@@ -83,6 +91,15 @@ def __init__(
         self.encoder.eval()
         self.sort_kind = sort_kind
 
+    def __call__(self, sentences):  # tokenize the input with self.tokenizer, then encode it
+        if self.spm_model:  # self.tokenizer only exists when an spm_model was given to __init__
+            sentences = self.tokenizer(sentences)
+            return self.encode_sentences(sentences)
+        else:  # no tokenizer was built, so untokenized input cannot be handled here
+            raise ValueError(
+                "Either initialize the encoder with an spm_model or pre-tokenize and use the encode_sentences method."
+            )
+
     def _process_batch(self, batch):
         tokens = batch.tokens
         lengths = batch.lengths