Add model2vec vectorization, closes #801

neuml · Oct 21, 2024 · a246052 · a246052
1 parent 4a26804
commit a246052
Show file tree

Hide file tree

Showing 7 changed files with 90 additions and 3 deletions.
diff --git a/docs/embeddings/configuration/vectors.md b/docs/embeddings/configuration/vectors.md
@@ -19,12 +19,12 @@ Uses default vector model path when enabled (default setting is True) and `path`
 
 ## method
 ```yaml
-method: transformers|sentence-transformers|llama.cpp|litellm|external|words
+method: transformers|sentence-transformers|llama.cpp|litellm|model2vec|external|words
 ```
 
 Embeddings method to use. If the method is not provided, it is inferred using the `path`.
 
-`sentence-transformers`, `llama.cpp`, `litellm` and `words` require the [vectors](../../../install/#vectors) extras package to be installed.
+`sentence-transformers`, `llama.cpp`, `litellm`, `model2vec` and `words` require the [vectors](../../../install/#vectors) extras package to be installed.
 
 ### transformers
 
@@ -42,12 +42,16 @@ Same as transformers but loads models with the [sentence-transformers](https://g
 
 ### llama.cpp
 
-Builds embeddings using a llama.cpp model. Supports both local and remote GGUF paths on the HF Hub.
+Builds embeddings using a [llama.cpp](https://github.com/abetlen/llama-cpp-python) model. Supports both local and remote GGUF paths on the HF Hub.
 
 ### litellm
 
 Builds embeddings using a LiteLLM model. See the [LiteLLM documentation](https://litellm.vercel.app/docs/providers) for the options available with LiteLLM models.
 
+### model2vec
+
+Builds embeddings using a [Model2Vec](https://github.com/MinishLab/model2vec) model.
+
 ### words
 
 Builds embeddings using a word embeddings model. Transformers models are the preferred vector backend in most cases. Word embeddings models may be deprecated in the future.

diff --git a/setup.py b/setup.py
@@ -96,6 +96,7 @@
     "fasttext>=0.9.2",
     "litellm>=1.37.16",
     "llama-cpp-python>=0.2.75",
+    "model2vec>=0.3.0",
     "pymagnitude-lite>=0.1.43",
     "scikit-learn>=0.23.1",
     "sentence-transformers>=2.2.0",

diff --git a/src/python/txtai/vectors/__init__.py b/src/python/txtai/vectors/__init__.py
@@ -8,4 +8,5 @@
 from .huggingface import HFVectors
 from .litellm import LiteLLM
 from .llama import LlamaCpp
+from .m2v import Model2Vec
 from .words import WordVectors
diff --git a/src/python/txtai/vectors/factory.py b/src/python/txtai/vectors/factory.py
@@ -8,6 +8,7 @@
 from .huggingface import HFVectors
 from .litellm import LiteLLM
 from .llama import LlamaCpp
+from .m2v import Model2Vec
 from .words import WordVectors
 
 
@@ -45,6 +46,10 @@ def create(config, scoring=None, models=None):
         if method == "llama.cpp":
             return LlamaCpp(config, scoring, models)
 
+        # Model2vec vectors
+        if method == "model2vec":
+            return Model2Vec(config, scoring, models)
+
         # Word vectors
         if method == "words":
             return WordVectors(config, scoring, models)

diff --git a/src/python/txtai/vectors/m2v.py b/src/python/txtai/vectors/m2v.py
@@ -0,0 +1,32 @@
+"""
+Model2Vec module
+"""
+
+# Conditional import
+try:
+    from model2vec import StaticModel
+
+    MODEL2VEC = True
+except ImportError:
+    MODEL2VEC = False
+
+from .base import Vectors
+
+
+class Model2Vec(Vectors):
+    """
+    Builds vectors using Model2Vec. Loads a StaticModel with from_pretrained and delegates encoding to it.
+    """
+
+    def __init__(self, config, scoring, models):
+        # Fail fast when model2vec is not installed. Check before parent constructor since it calls loadmodel
+        if not MODEL2VEC:
+            raise ImportError('Model2Vec is not available - install "vectors" extra to enable')
+
+        super().__init__(config, scoring, models)
+
+    def loadmodel(self, path):
+        return StaticModel.from_pretrained(path)
+
+    def encode(self, data):
+        return self.model.encode(data)
diff --git a/test/python/testoptional.py b/test/python/testoptional.py
@@ -33,6 +33,7 @@ def setUpClass(cls):
             "libcloud.storage.providers",
             "litellm",
             "llama_cpp",
+            "model2vec",
             "networkx",
             "nltk",
             "onnxmltools",
@@ -277,6 +278,9 @@ def testVectors(self):
         with self.assertRaises(ImportError):
             VectorsFactory.create({"method": "llama.cpp", "path": "nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q2_K.gguf"}, None)
 
+        with self.assertRaises(ImportError):
+            VectorsFactory.create({"method": "model2vec", "path": "minishlab/M2V_base_output"}, None)
+
         with self.assertRaises(ImportError):
             VectorsFactory.create({"method": "sentence-transformers", "path": "sentence-transformers/nli-mpnet-base-v2"}, None)
 

diff --git a/test/python/testvectors/testm2v.py b/test/python/testvectors/testm2v.py
@@ -0,0 +1,40 @@
+"""
+Model2Vec module tests
+"""
+
+import os
+import unittest
+
+import numpy as np
+
+from txtai.vectors import VectorsFactory
+
+
+class TestModel2Vec(unittest.TestCase):
+    """
+    Model2Vec vectors tests
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """
+        Create Model2Vec instance.
+        """
+
+        cls.model = VectorsFactory.create({"method": "model2vec", "path": "minishlab/M2V_base_output"}, None)
+
+    def testIndex(self):
+        """
+        Test indexing with Model2Vec vectors. Verifies ids, dimension, batch count and the serialized embeddings stream.
+        """
+
+        ids, dimension, batches, stream = self.model.index([(0, "test", None)])
+
+        self.assertEqual(len(ids), 1)
+        self.assertEqual(dimension, 256)
+        self.assertEqual(batches, 1)
+        self.assertTrue(os.path.exists(stream))  # exists() returns a bool, so it must be checked for truth, not for None
+
+        # Test shape of serialized embeddings
+        with open(stream, "rb") as queue:
+            self.assertEqual(np.load(queue).shape, (1, 256))