Skip to content

Commit

Permalink
Add model2vec vectorization, closes #801
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Oct 21, 2024
1 parent 4a26804 commit a246052
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 3 deletions.
10 changes: 7 additions & 3 deletions docs/embeddings/configuration/vectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ Uses default vector model path when enabled (default setting is True) and `path`

## method
```yaml
method: transformers|sentence-transformers|llama.cpp|litellm|external|words
method: transformers|sentence-transformers|llama.cpp|litellm|model2vec|external|words
```

Embeddings method to use. If the method is not provided, it is inferred using the `path`.

`sentence-transformers`, `llama.cpp`, `litellm` and `words` require the [vectors](../../../install/#vectors) extras package to be installed.
`sentence-transformers`, `llama.cpp`, `litellm`, `model2vec` and `words` require the [vectors](../../../install/#vectors) extras package to be installed.

### transformers

Expand All @@ -42,12 +42,16 @@ Same as transformers but loads models with the [sentence-transformers](https://g

### llama.cpp

Builds embeddings using a llama.cpp model. Supports both local and remote GGUF paths on the HF Hub.
Builds embeddings using a [llama.cpp](https://github.com/abetlen/llama-cpp-python) model. Supports both local and remote GGUF paths on the HF Hub.

### litellm

Builds embeddings using a LiteLLM model. See the [LiteLLM documentation](https://litellm.vercel.app/docs/providers) for the options available with LiteLLM models.

### model2vec

Builds embeddings using a [Model2Vec](https://github.com/MinishLab/model2vec) model.

### words

Builds embeddings using a word embeddings model. Transformers models are the preferred vector backend in most cases. Word embeddings models may be deprecated in the future.
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
"fasttext>=0.9.2",
"litellm>=1.37.16",
"llama-cpp-python>=0.2.75",
"model2vec>=0.3.0",
"pymagnitude-lite>=0.1.43",
"scikit-learn>=0.23.1",
"sentence-transformers>=2.2.0",
Expand Down
1 change: 1 addition & 0 deletions src/python/txtai/vectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@
from .huggingface import HFVectors
from .litellm import LiteLLM
from .llama import LlamaCpp
from .m2v import Model2Vec
from .words import WordVectors
5 changes: 5 additions & 0 deletions src/python/txtai/vectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .huggingface import HFVectors
from .litellm import LiteLLM
from .llama import LlamaCpp
from .m2v import Model2Vec
from .words import WordVectors


Expand Down Expand Up @@ -45,6 +46,10 @@ def create(config, scoring=None, models=None):
if method == "llama.cpp":
return LlamaCpp(config, scoring, models)

# Model2vec vectors
if method == "model2vec":
return Model2Vec(config, scoring, models)

# Word vectors
if method == "words":
return WordVectors(config, scoring, models)
Expand Down
32 changes: 32 additions & 0 deletions src/python/txtai/vectors/m2v.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Model2Vec module
"""

# Conditional import
try:
from model2vec import StaticModel

MODEL2VEC = True
except ImportError:
MODEL2VEC = False

from .base import Vectors


class Model2Vec(Vectors):
    """
    Vectors backend that generates embeddings with a Model2Vec static model.
    """

    def __init__(self, config, scoring, models):
        """
        Creates a new Model2Vec vectors instance.

        Args:
            config: vectors configuration
            scoring: optional scoring instance
            models: models cache

        Raises:
            ImportError: if the model2vec package could not be imported
        """

        # The parent constructor calls loadmodel, so the dependency check has to run first
        if not MODEL2VEC:
            raise ImportError('Model2Vec is not available - install "vectors" extra to enable')

        super().__init__(config, scoring, models)

    def loadmodel(self, path):
        """
        Loads a Model2Vec static model.

        Args:
            path: model path passed to StaticModel.from_pretrained

        Returns:
            loaded StaticModel
        """

        model = StaticModel.from_pretrained(path)
        return model

    def encode(self, data):
        """
        Encodes data with the loaded model.

        Args:
            data: input data to encode

        Returns:
            result of the model's encode call
        """

        embeddings = self.model.encode(data)
        return embeddings
4 changes: 4 additions & 0 deletions test/python/testoptional.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def setUpClass(cls):
"libcloud.storage.providers",
"litellm",
"llama_cpp",
"model2vec",
"networkx",
"nltk",
"onnxmltools",
Expand Down Expand Up @@ -277,6 +278,9 @@ def testVectors(self):
with self.assertRaises(ImportError):
VectorsFactory.create({"method": "llama.cpp", "path": "nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q2_K.gguf"}, None)

with self.assertRaises(ImportError):
VectorsFactory.create({"method": "model2vec", "path": "minishlab/M2V_base_output"}, None)

with self.assertRaises(ImportError):
VectorsFactory.create({"method": "sentence-transformers", "path": "sentence-transformers/nli-mpnet-base-v2"}, None)

Expand Down
40 changes: 40 additions & 0 deletions test/python/testvectors/testm2v.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Model2Vec module tests
"""

import os
import unittest

import numpy as np

from txtai.vectors import VectorsFactory


class TestModel2Vec(unittest.TestCase):
"""
Model2vec vectors tests
"""

@classmethod
def setUpClass(cls):
"""
Create Model2Vec instance.
"""

cls.model = VectorsFactory.create({"method": "model2vec", "path": "minishlab/M2V_base_output"}, None)

def testIndex(self):
"""
Test indexing with Model2Vec vectors.
"""

ids, dimension, batches, stream = self.model.index([(0, "test", None)])

self.assertEqual(len(ids), 1)
self.assertEqual(dimension, 256)
self.assertEqual(batches, 1)
self.assertIsNotNone(os.path.exists(stream))

# Test shape of serialized embeddings
with open(stream, "rb") as queue:
self.assertEqual(np.load(queue).shape, (1, 256))

0 comments on commit a246052

Please sign in to comment.