Merge pull request #17 from SCAI-BIO/add-examples

Add examples
SCAI-BIO · Feb 26, 2024 · 3aa86fe · 3aa86fe
2 parents 4ddc998 + 001747a
commit 3aa86fe
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -64,8 +64,10 @@ docker run  -p 8000:80 ghcr.io/scai-bio/backend:latest
 
 ### Python
 
+#### Creating and using stored mappings
+
 A simple example of how to initialize an in-memory database and compute a similarity mapping is shown in 
-[index/scripts/mapping_example.py](index/scripts/mapping_example.py):
+[index/scripts/mapping_db_example.py](index/scripts/mapping_db_example.py):
 
 ```python
 # omit mode to create a permanent db file instead
@@ -102,6 +104,24 @@ You can also import data from file sources (csv, tsv, xlsx) or from a public API
 download & compute embeddings for SNOMED from ebi OLS can be found in 
 [index/scripts/ols_snomed_retrieval.py](index/scripts/ols_snomed_retrieval.py).
 
+#### Harmonizing excel/csv resources
+
+You can import common data models, terminology sources or data dictionaries for harmonization directly from a
+csv, tsv or excel file. An example of how to match two separate variable descriptions is shown in
+[index/scripts/mapping_excel_example.py](index/scripts/mapping_excel_example.py):
+
+```python
+# Variable and description refer to the corresponding column names in your excel sheet
+source = DataDictionarySource("source.xlsx", variable_field="var", description_field="desc")
+target = DataDictionarySource("target.xlsx", variable_field="var", description_field="desc")
+
+df = map_dictionary_to_dictionary(source, target)
+df.to_excel("result.xlsx")
+```
+
+The resulting file contains the pairwise variable mapping based on the closest similarity for all possible matches 
+as well as a similarity measure per row.
+
 ## Configuration
 
 ### Description Embeddings

diff --git a/index/process/mapping.py b/index/process/mapping.py
@@ -0,0 +1,34 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+
+from index.embedding import EmbeddingModel, MPNetAdapter
+from index.process.parsing import DataDictionarySource
+
+
+def map_dictionary_to_dictionary(source: DataDictionarySource,
+                                 target: DataDictionarySource,
+                                 embedding_model: EmbeddingModel = None) -> pd.DataFrame:
+    """Map each source variable to the target variable with the most similar description.
+
+    Both dictionaries are loaded via ``to_dataframe()``; their ``description`` columns are
+    embedded with ``embedding_model`` and compared pairwise by cosine similarity. For every
+    source row only the single best-scoring target row is kept.
+
+    :param source: Data dictionary whose variables should be mapped.
+    :param target: Data dictionary providing the candidate matches.
+    :param embedding_model: Model used to embed the descriptions. If omitted, a new
+        ``MPNetAdapter`` is created for this call.
+    :return: DataFrame with the columns ``Source Variable``, ``Target Variable`` and
+        ``Similarity``, one row per source variable.
+    :raises ValueError: If the target dictionary contains no rows.
+    """
+    result_columns = ['Source Variable', 'Target Variable', 'Similarity']
+
+    # Load data
+    df_source = source.to_dataframe()
+    df_target = target.to_dataframe()
+
+    # Nothing to map: return an empty frame with the regular result columns.
+    if df_source.empty:
+        return pd.DataFrame(columns=result_columns)
+    # Without candidates there is no "closest" match; fail with a clear message instead of
+    # an opaque error from the similarity/argmax computation below.
+    if df_target.empty:
+        raise ValueError("Target data dictionary is empty, no candidates to map to.")
+
+    # Create the default model lazily. A default of ``MPNetAdapter()`` in the signature is
+    # evaluated once when the module is imported and then shared by every call.
+    if embedding_model is None:
+        embedding_model = MPNetAdapter()
+
+    # Compute embeddings
+    embeddings_source = embedding_model.get_embeddings(df_source["description"].tolist())
+    embeddings_target = embedding_model.get_embeddings(df_target["description"].tolist())
+
+    # Compute cosine similarities (rows: source entries, columns: target entries)
+    similarities = cosine_similarity(embeddings_source, embeddings_target)
+
+    # Find closest matches: best-scoring target column for every source row
+    max_similarities = np.max(similarities, axis=1)
+    closest_match_indices = np.argmax(similarities, axis=1)
+
+    # Create DataFrame for closest matches; ``.values`` drops the target index so the
+    # matched variables line up positionally with the source rows.
+    result_df = pd.DataFrame({
+        'Source Variable': df_source["variable"],
+        'Target Variable': df_target.iloc[closest_match_indices]["variable"].values,
+        'Similarity': max_similarities
+    })
+
+    return result_df
diff --git a/index/scripts/mapping_example.py → index/scripts/mapping_db_example.py b/index/scripts/mapping_example.py → index/scripts/mapping_db_example.py
diff --git a/index/scripts/mapping_excel_example.py b/index/scripts/mapping_excel_example.py
@@ -0,0 +1,9 @@
+from index.process.mapping import map_dictionary_to_dictionary
+from index.process.parsing import DataDictionarySource
+
+# Variable and description refer to the corresponding column names in your excel sheet.
+# NOTE: the file extension must be a real spreadsheet extension (".xlsx"): pandas selects the
+# Excel writer engine from it and raises a ValueError for an unknown one such as ".xlxs".
+source = DataDictionarySource("source.xlsx", variable_field="var", description_field="desc")
+target = DataDictionarySource("target.xlsx", variable_field="var", description_field="desc")
+
+# One row per source variable: closest target variable plus its similarity score
+df = map_dictionary_to_dictionary(source, target)
+df.to_excel("result.xlsx")
diff --git a/tests/test_mapping.py b/tests/test_mapping.py
@@ -0,0 +1,19 @@
+import os
+import unittest
+from index.embedding import MPNetAdapter, TextEmbedding
+import numpy as np
+
+from index.process.mapping import map_dictionary_to_dictionary
+from index.process.parsing import DataDictionarySource
+
+
+class TestEmbedding(unittest.TestCase):
+    """Tests for the dictionary-to-dictionary mapping in ``index.process.mapping``."""
+
+    TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__))
+
+    # Shared fixture: data dictionary read from the bundled CSV test resource
+    data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'),
+                                                  "VAR_1", "DESC")
+
+    def test_map_dictionary_to_dictionary(self):
+        """Mapping a dictionary onto itself yields one perfectly matching row per variable."""
+        df = map_dictionary_to_dictionary(self.data_dictionary_source, self.data_dictionary_source)
+        expected_rows = len(self.data_dictionary_source.to_dataframe())
+
+        self.assertListEqual(['Source Variable', 'Target Variable', 'Similarity'], list(df.columns))
+        self.assertEqual(expected_rows, len(df))
+        # Every description is also present in the target, so the best cosine similarity of
+        # each row has to be (numerically) 1.
+        self.assertTrue(np.allclose(df['Similarity'], 1.0, atol=1e-4))