# embedding_model.py
import numpy as np

try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    print("OpenAI module not found. Using fallback embedding method.")


class EmbeddingModel:
    def __init__(self, model_name='text-embedding-3-small', use_openrouter=False):
        self.model_name = model_name
        self.use_openrouter = use_openrouter
        if OPENAI_AVAILABLE and not use_openrouter:
            self.client = OpenAI()
        else:
            # Fall back to a simple hash-based embedding method
            self.dimension = 100
        if use_openrouter:
            # Imported lazily so the module still loads without this optional dependency
            from openrouter_client import OpenRouterClient
            self.openrouter_client = OpenRouterClient()

    def get_embedding(self, text):
        """
        Generate an embedding for the given text.
        :param text: Input text string
        :return: Numpy array representing the embedding
        """
        text = text.replace("\n", " ")
        if OPENAI_AVAILABLE and not self.use_openrouter:
            return np.array(self.client.embeddings.create(input=[text], model=self.model_name).data[0].embedding)
        elif self.use_openrouter:
            return np.array(self.openrouter_client.generate_embedding(text, model=self.model_name))
        else:
            return self._fallback_embedding(text)

    def get_embeddings(self, texts):
        """
        Generate embeddings for a list of texts.
        :param texts: List of input text strings
        :return: List of numpy arrays representing the embeddings
        """
        texts = [text.replace("\n", " ") for text in texts]
        # Route each request the same way as get_embedding so OpenRouter-configured
        # instances never hit the (possibly uninitialized) OpenAI client
        if OPENAI_AVAILABLE and not self.use_openrouter:
            embeddings = self.client.embeddings.create(input=texts, model=self.model_name).data
            return [np.array(embedding.embedding) for embedding in embeddings]
        elif self.use_openrouter:
            return [np.array(self.openrouter_client.generate_embedding(text, model=self.model_name)) for text in texts]
        else:
            return [self._fallback_embedding(text) for text in texts]

    def cosine_similarity(self, embedding1, embedding2):
        """
        Calculate the cosine similarity between two embeddings.
        :param embedding1: First embedding (numpy array)
        :param embedding2: Second embedding (numpy array)
        :return: Cosine similarity score
        """
        return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

    def get_embedding_dimension(self):
        """
        Get the dimension of the embeddings produced by this model.
        :return: Integer representing the embedding dimension
        """
        if OPENAI_AVAILABLE and not self.use_openrouter:
            # OpenAI's text-embedding-3-small model produces 1536-dimensional embeddings
            return 1536
        elif self.use_openrouter:
            # Assumes the OpenRouter model matches text-embedding-3-small's dimensionality
            return 1536
        else:
            return self.dimension

    def _fallback_embedding(self, text):
        """
        A simple fallback method to generate embeddings when OpenAI is not available.
        :param text: Input text string
        :return: Numpy array representing the embedding
        """
        # This is a very simplistic embedding method and should be replaced with a more sophisticated one
        words = text.lower().split()
        embedding = np.zeros(self.dimension)
        for i, word in enumerate(words[:self.dimension]):
            embedding[i] = hash(word) % 100  # Using hash for simplicity
        norm = np.linalg.norm(embedding)
        # Avoid division by zero for empty or all-zero inputs
        return embedding / norm if norm > 0 else embedding
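

# --- Usage sketch ---
# A minimal demonstration of the EmbeddingModel API above. It assumes an
# OPENAI_API_KEY environment variable is set when the OpenAI path is active;
# if the openai package is not installed, the hash-based fallback runs and
# the similarity scores below are only illustrative.
if __name__ == "__main__":
    model = EmbeddingModel()

    # Single embedding
    vec = model.get_embedding("The quick brown fox")
    print(f"Embedding dimension: {len(vec)} (reported: {model.get_embedding_dimension()})")

    # Batch embeddings and pairwise similarity against the first text
    texts = ["The quick brown fox", "A fast auburn fox", "Quarterly revenue report"]
    vecs = model.get_embeddings(texts)
    for i in range(1, len(texts)):
        score = model.cosine_similarity(vecs[0], vecs[i])
        print(f"similarity(texts[0], texts[{i}]) = {score:.4f}")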