-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #17 from SCAI-BIO/add-examples
Add examples
- Loading branch information
Showing
5 changed files
with
83 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from index.embedding import EmbeddingModel, MPNetAdapter
from index.process.parsing import DataDictionarySource
|
||
|
||
def map_dictionary_to_dictionary(source: DataDictionarySource,
                                 target: DataDictionarySource,
                                 embedding_model: Optional[EmbeddingModel] = None) -> pd.DataFrame:
    """Map every variable of ``source`` to its closest variable in ``target``.

    The descriptions of both data dictionaries are embedded with
    ``embedding_model``; each source variable is then paired with the target
    variable whose description embedding has the highest cosine similarity.

    :param source: Data dictionary whose variables should be mapped. Its
        ``to_dataframe()`` result must provide the columns ``"variable"`` and
        ``"description"``.
    :param target: Data dictionary that is searched for the closest match;
        same column requirements as ``source``.
    :param embedding_model: Model used to embed the descriptions. When
        ``None`` (the default) a new ``MPNetAdapter`` is created for this
        call; pass an instance explicitly to reuse one across calls.
    :return: DataFrame with one row per source variable and the columns
        ``'Source Variable'``, ``'Target Variable'`` and ``'Similarity'``.
    """
    # Build the default lazily. A ``MPNetAdapter()`` default in the signature
    # is evaluated once when the module is imported and then shared by every
    # call, which makes a plain import construct a model.
    if embedding_model is None:
        embedding_model = MPNetAdapter()

    # Load data
    df_source = source.to_dataframe()
    df_target = target.to_dataframe()

    # Compute embeddings
    embeddings_source = embedding_model.get_embeddings(df_source["description"].tolist())
    embeddings_target = embedding_model.get_embeddings(df_target["description"].tolist())

    # Compute cosine similarities: rows follow the source entries,
    # columns follow the target entries.
    similarities = cosine_similarity(embeddings_source, embeddings_target)

    # Find the closest target (best column) for every source row
    max_similarities = np.max(similarities, axis=1)
    closest_match_indices = np.argmax(similarities, axis=1)

    # Create DataFrame for closest matches; the argmax results are positions,
    # hence the positional ``iloc`` lookup into the target frame.
    result_df = pd.DataFrame({
        'Source Variable': df_source["variable"],
        'Target Variable': df_target.iloc[closest_match_indices]["variable"].values,
        'Similarity': max_similarities
    })

    return result_df
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from index.process.mapping import map_dictionary_to_dictionary | ||
from index.process.parsing import DataDictionarySource | ||
|
||
# variable_field and description_field name the columns of your Excel sheet
# that hold the variable names and their descriptions.
# NOTE: the Excel file extension is ".xlsx"; pandas selects the Excel writer
# from the extension and rejects an unknown one such as ".xlxs".
source = DataDictionarySource("source.xlsx", variable_field="var", description_field="desc")
target = DataDictionarySource("target.xlsx", variable_field="var", description_field="desc")

# Map every source variable to its closest target variable and save the result
df = map_dictionary_to_dictionary(source, target)
df.to_excel("result.xlsx")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import os | ||
import unittest | ||
from index.embedding import MPNetAdapter, TextEmbedding | ||
import numpy as np | ||
|
||
from index.process.mapping import map_dictionary_to_dictionary | ||
from index.process.parsing import DataDictionarySource | ||
|
||
|
||
class TestEmbedding(unittest.TestCase):
    """Checks ``map_dictionary_to_dictionary`` on the bundled test data dictionary."""

    # Directory of this test module; test resources are resolved relative to it.
    TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__))

    data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'),
                                                  "VAR_1", "DESC")

    def test_map_dictionary_to_dictionary(self):
        """Mapping a dictionary onto itself yields one perfect match per variable."""
        df = map_dictionary_to_dictionary(self.data_dictionary_source, self.data_dictionary_source)

        # The result exposes exactly the documented columns, in order.
        self.assertListEqual(['Source Variable', 'Target Variable', 'Similarity'], list(df.columns))

        # One result row per source variable.
        self.assertEqual(len(self.data_dictionary_source.to_dataframe()), len(df))

        # Every description is compared against itself as well, so the best
        # similarity per row must be (numerically) 1.
        np.testing.assert_allclose(df['Similarity'].to_numpy(dtype=float), 1.0, atol=1e-4)