Merge branch 'main' into add-database-integration
tiadams committed Feb 22, 2024
2 parents eef31f4 + e66c3f0 commit f067d8e
Showing 3 changed files with 58 additions and 16 deletions.
34 changes: 23 additions & 11 deletions README.md
INDEX is an intelligent data steward toolbox that leverages Large Language Model embeddings for automated Data-Harmonization.

## Table of Contents
- [Introduction](#introduction)
- [Installation](#installation)
- [Configuration](#configuration)

## Introduction

INDEX uses vector embeddings from variable descriptions to suggest mappings for datasets, improving data indexing and retrieval. Confirmed mappings are stored with their vector representations in a knowledge base for fast search and retrieval, enhancing data management and analysis. New mappings can be added iteratively to improve suggestions for future harmonization tasks.
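The suggestion step described above amounts to a nearest-neighbour search over the stored embeddings. The sketch below uses toy stand-in vectors (a real run would use model-generated embeddings), and `suggest_mapping` is an illustrative helper, not part of the INDEX API:

```python
import numpy as np

# Toy knowledge base: confirmed variable descriptions with stand-in
# embedding vectors. In INDEX these come from a language model.
knowledge_base = {
    "patient age in years": np.array([0.9, 0.1, 0.0]),
    "systolic blood pressure": np.array([0.1, 0.9, 0.2]),
}

def suggest_mapping(query_vec, kb):
    """Illustrative helper: return the stored description whose
    embedding is closest to the query by cosine similarity."""
    def cosine(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return max(kb, key=lambda desc: cosine(query_vec, kb[desc]))

# Query vector standing in for the embedding of e.g. "age of participant":
print(suggest_mapping(np.array([0.85, 0.15, 0.05]), knowledge_base))
# prints "patient age in years"
```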

## Installation
Clone the repository:

```bash
git clone https://github.com/SCAI-BIO/index.git
```

Run the backend locally:

```bash
uvicorn main:app --reload --port 5000
```

### Run the Backend via Docker

You can either build the Docker container locally or download the latest build from the INDEX GitHub package registry.


```bash
docker build . -t ghcr.io/scai-bio/backend:latest
```

```bash
docker pull ghcr.io/scai-bio/backend:latest
```

After building or downloading the image, you can start the container and access the INDEX API, by default on [localhost:8000](http://localhost:8000):

```bash
docker run -p 8000:80 ghcr.io/scai-bio/backend:latest
```

## Configuration

### Description Embeddings
Expand All @@ -55,4 +61,10 @@ You can configure INDEX to use either a local language model or call OPenAPIs em
is significantly faster, you will need to provide an API key that is linked to your OpenAI account.

Currently, the following local models are implemented:
* [MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)

The API defaults to a local embedding model. You can adjust which model is loaded on startup in the configuration.
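The configuration mechanism itself is not shown in this diff; one common pattern for such a default-with-override setup is an environment variable. The variable name `INDEX_EMBEDDING_MODEL` below is hypothetical:

```python
import os

# Hypothetical sketch: fall back to the local default model unless an
# environment variable overrides it. The variable name is an assumption,
# not INDEX's actual configuration key.
DEFAULT_MODEL = "sentence-transformers/all-mpnet-base-v2"
model_name = os.environ.get("INDEX_EMBEDDING_MODEL", DEFAULT_MODEL)
print(model_name)
```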

### Database

By default, INDEX stores mappings in a file-based database in the [following directory](https://github.com/SCAI-BIO/index/tree/main/index/db).
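As a rough sketch of what a file-based mapping store can look like, here is SQLite with JSON-serialized embeddings. The schema and helper names are illustrative assumptions, not INDEX's actual layout:

```python
import json
import sqlite3

# Illustrative schema only; INDEX's real db layout may differ.
conn = sqlite3.connect(":memory:")  # use a file path for persistence
conn.execute("CREATE TABLE mappings (text TEXT PRIMARY KEY, embedding TEXT)")

def store_mapping(text, embedding):
    # Embeddings are plain float lists, so JSON round-trips them losslessly.
    conn.execute(
        "INSERT OR REPLACE INTO mappings VALUES (?, ?)",
        (text, json.dumps(embedding)),
    )

def load_mapping(text):
    row = conn.execute(
        "SELECT embedding FROM mappings WHERE text = ?", (text,)
    ).fetchone()
    return json.loads(row[0]) if row else None

store_mapping("patient age in years", [0.1, 0.2, 0.3])
print(load_mapping("patient age in years"))  # [0.1, 0.2, 0.3]
```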
12 changes: 7 additions & 5 deletions index/embedding.py
    def get_embeddings(self, messages: [str], model="text-embedding-ada-002"):


class MPNetAdapter(EmbeddingModel):
    def __init__(self, model="sentence-transformers/all-mpnet-base-v2"):
        logging.getLogger().setLevel(logging.INFO)
        self.mpnet_model = SentenceTransformer(model)

    def get_embedding(self, text: str):
        logging.info(f"Getting embedding for {text}")
        try:
            if text is None or text == "" or text is np.nan:
                logging.warning("Empty text passed to get_embedding")
                return None
            if isinstance(text, str):
                text = text.replace("\n", " ")
            return self.mpnet_model.encode(text)
        except Exception as e:
            logging.error(f"Error getting embedding for {text}: {e}")
            return None

    def get_embeddings(self, messages: [str]) -> [[float]]:
        embeddings = self.mpnet_model.encode(messages)
        flattened_embeddings = [[float(element) for element in row] for row in embeddings]
        return flattened_embeddings


class TextEmbedding:
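The nested list comprehension in `get_embeddings` converts the model's numpy matrix into plain Python floats, which keeps the result JSON-serializable. A minimal illustration, with a stand-in array in place of the encoder output:

```python
import json
import numpy as np

# Stand-in for SentenceTransformer.encode(messages): a 2x3 float32 matrix.
embeddings = np.array([[0.5, 0.25, 0.125], [1.0, 2.0, 4.0]], dtype=np.float32)

# Same flattening as MPNetAdapter.get_embeddings: numpy scalars become floats.
flattened = [[float(element) for element in row] for row in embeddings]

json.dumps(flattened)  # succeeds; a raw ndarray would raise TypeError here
```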
28 changes: 28 additions & 0 deletions tests/test_embedding.py
import unittest
from index.embedding import MPNetAdapter, TextEmbedding
import numpy as np


class TestEmbedding(unittest.TestCase):

    def setUp(self):
        self.mpnet_adapter = MPNetAdapter(model="sentence-transformers/all-mpnet-base-v2")

    def test_mpnet_adapter_get_embedding(self):
        text = "This is a test sentence."
        embedding = self.mpnet_adapter.get_embedding(text)
        self.assertIsInstance(embedding, np.ndarray)
        self.assertEqual(len(embedding), 768)

    def test_mpnet_adapter_get_embeddings(self):
        messages = ["This is message 1.", "This is message 2."]
        embeddings = self.mpnet_adapter.get_embeddings(messages)
        self.assertIsInstance(embeddings, list)
        self.assertEqual(len(embeddings), len(messages))
        self.assertEqual(len(embeddings[0]), 768)

    def test_text_embedding(self):
        text = "This is a test sentence."
        embedding = [0.1, 0.2, 0.3, 0.4]
        text_embedding = TextEmbedding(text, embedding)
        self.assertEqual(text_embedding.text, text)
        self.assertEqual(text_embedding.embedding, embedding)
