Skip to content

Commit

Permalink
[emb] Fix hf emb path and apply normalization (#92)
Browse files Browse the repository at this point in the history
* [emb] Fix hf emb path, and apply normalization

* enhance comments in env templates

* update tag
  • Loading branch information
finaldie authored Aug 29, 2024
1 parent 649a559 commit 3135254
Show file tree
Hide file tree
Showing 9 changed files with 46 additions and 14 deletions.
7 changes: 7 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ OLLAMA_URL=http://localhost:11434

# The generic Text embedding provider. Supported providers:
# openai, hf, hf_inst, ollama
#
# **Attention**: Changing the embedding provider/model requires re-creating
# the embedding collections if the embedding dimension is different.
# Dimensions for different provider/model:
# - OpenAI: 1536
# - Ollama(nomic-embed-text): 768
# - HF (all-MiniLM-L6-v2): 384
EMBEDDING_PROVIDER=openai

# models
Expand Down
7 changes: 7 additions & 0 deletions .env.template.k8s
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ OLLAMA_URL=http://localhost:11434

# The generic Text embedding provider. Supported providers:
# openai, hf, hf_inst, ollama
#
# **Attention**: Changing the embedding provider/model requires re-creating
# the embedding collections if the embedding dimension is different.
# Dimensions for different provider/model:
# - OpenAI: 1536
# - Ollama(nomic-embed-text): 768
# - HF (all-MiniLM-L6-v2): 384
EMBEDDING_PROVIDER=openai

# models:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ deps: prepare-env
# deps: docker-network

repo ?= finaldie/auto-news
tag ?= 0.9.12
tag ?= 0.9.13

build:
cd docker && make build repo=$(repo) tag=$(tag) topdir=$(topdir)
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ x-airflow-common:
# Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
# and uncomment the "build" line below, Then run `docker-compose build` to build the images.
# image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.5.2}
image: ${AIRFLOW_IMAGE_NAME:-finaldie/auto-news:0.9.12}
image: ${AIRFLOW_IMAGE_NAME:-finaldie/auto-news:0.9.13}
# build: .

networks:
Expand Down
2 changes: 1 addition & 1 deletion docker/portainer/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ x-airflow-common:
# Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
# and uncomment the "build" line below, Then run `docker-compose build` to build the images.
# image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.5.2}
image: ${AIRFLOW_IMAGE_NAME:-finaldie/auto-news:0.9.12}
image: ${AIRFLOW_IMAGE_NAME:-finaldie/auto-news:0.9.13}
# build: .

networks:
Expand Down
2 changes: 1 addition & 1 deletion helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ airflow:
images:
airflow:
repository: finaldie/auto-news
tag: 0.9.12
tag: 0.9.13

useDefaultImageForMigration: true

Expand Down
14 changes: 10 additions & 4 deletions src/embedding_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from embedding import Embedding
from db_cli import DBClient
import utils
import embedding_utils as emb_utils


class EmbeddingHuggingFace(Embedding):
Expand All @@ -12,7 +13,7 @@ class EmbeddingHuggingFace(Embedding):
from HuggingFace)
"""
def __init__(self, model_name="all-MiniLM-L6-v2"):
super.__init__(model_name)
super().__init__(model_name)

self.api = HuggingFaceEmbeddings(model_name=self.model_name)
print("Initialized EmbeddingHuggingFace")
Expand All @@ -23,12 +24,17 @@ def dim(self):
def getname(self, start_date, prefix="news"):
return f"{prefix}_embedding_hf_{start_date}".replace("-", "_")

def create(self, text: str):
def create(self, text: str, normalize=True):
"""
It creates the embedding with 1536 dimentions by default
Query local HF embedding model
"""

return self.api.embed_query(text)
emb = self.api.embed_query(text)

if normalize:
emb = emb_utils.l2_norm(emb)

return emb

def get_or_create(
self,
Expand Down
14 changes: 10 additions & 4 deletions src/embedding_hf_inst.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from embedding import Embedding
from db_cli import DBClient
import utils
import embedding_utils as emb_utils


class EmbeddingHuggingFaceInstruct(Embedding):
Expand All @@ -12,7 +13,7 @@ class EmbeddingHuggingFaceInstruct(Embedding):
from HuggingFace)
"""
def __init__(self, model_name="hkunlp/instructor-xl"):
super.__init__(model_name)
super().__init__(model_name)

self.api = HuggingFaceInstructEmbeddings(model_name=self.model_name)
print("Initialized EmbeddingHuggingFaceInstruct")
Expand All @@ -23,12 +24,17 @@ def dim(self):
def getname(self, start_date, prefix="news"):
return f"{prefix}_embedding_hf_inst_{start_date}".replace("-", "_")

def create(self, text: str):
def create(self, text: str, normalize=True):
"""
It creates the embedding with 384 dimentions by default
Query local HF embedding model
"""

return self.api.embed_query(text)
emb = self.api.embed_query(text)

if normalize:
emb = emb_utils.l2_norm(emb)

return emb

def get_or_create(
self,
Expand Down
10 changes: 8 additions & 2 deletions src/embedding_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
###############################################################################
########################################################################
# Embedding Utils
###############################################################################
########################################################################
import numpy as np


def similarity_topk(embedding_items: list, metric_type, threshold=None, k=3):
"""
Expand Down Expand Up @@ -64,3 +66,7 @@ def similarity_topk_cosine(items: list, threshold, k):

# The returned value is sorted by most similar -> least similar
return sorted_items[:k]


def l2_norm(emb):
    """
    Scale an embedding vector to unit L2 (Euclidean) length.

    @param emb: sequence of numbers (list, tuple or array-like)
    @return: a plain Python list of floats holding emb / ||emb||_2.
             A vector whose norm is zero (all zeros, or empty) cannot
             be normalized and is returned unchanged as floats, rather
             than as NaNs produced by a division by zero.
    """
    vec = np.asarray(emb, dtype=float)
    norm = np.linalg.norm(vec)

    # Guard against 0/0: dividing a zero vector by its norm would yield
    # NaNs (plus a RuntimeWarning) that silently poison later similarity
    # computations.
    if norm == 0:
        return vec.tolist()

    return (vec / norm).tolist()

0 comments on commit 3135254

Please sign in to comment.