Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Display descriptions and add the option to display the k most s… #46

Merged
merged 2 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 50 additions & 10 deletions datastew/process/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,24 @@

def map_dictionary_to_dictionary(source: DataDictionarySource,
target: DataDictionarySource,
embedding_model: EmbeddingModel = MPNetAdapter()) -> pd.DataFrame:
embedding_model: EmbeddingModel = MPNetAdapter(),
limit: int = 1) -> pd.DataFrame:
"""
Map variables from a source data dictionary to the closest matching variables in a target data dictionary
based on the similarity of their descriptions.

:param source: The source data dictionary containing variables and their descriptions.
:param target: The target data dictionary containing variables and their descriptions to be matched against.
:param embedding_model: The model used to convert descriptions into embeddings for similarity comparison.
Defaults to MPNetAdapter().
:param limit: The number of closest matches to retrieve for each source variable. Defaults to 1.
:return: A DataFrame containing the closest matches with the following columns:
- 'Source Variable': The variable names from the source data dictionary.
- 'Target Variable': The closest matching variable names from the target data dictionary.
- 'Source Description': The descriptions of the variables from the source data dictionary.
- 'Target Description': The descriptions of the closest matching variables from the target data dictionary.
- 'Similarity': The cosine similarity score between the source and target variable descriptions.
"""
# Load data
df_source = source.to_dataframe()
df_target = target.to_dataframe()
Expand All @@ -20,15 +37,38 @@ def map_dictionary_to_dictionary(source: DataDictionarySource,
# Compute cosine similarities
similarities = cosine_similarity(embeddings_source, embeddings_target)

# Find closest matches
max_similarities = np.max(similarities, axis=1)
closest_match_indices = np.argmax(similarities, axis=1)
if limit == 1:
# Find the closest matches
max_similarities = np.max(similarities, axis=1)
closest_match_indices = np.argmax(similarities, axis=1)

# Create DataFrame for closest matches
result_df = pd.DataFrame({
'Source Variable': df_source["variable"],
'Target Variable': df_target.iloc[closest_match_indices]["variable"].values,
'Source Description': df_source["description"],
'Target Description': df_target.iloc[closest_match_indices]["description"].values,
'Similarity': max_similarities
})

else:
if limit > len(df_target):
ValueError(f"The limit {limit} cannot be greater than the number of target variables {len(df_target)}.")

# Get the indices of the top 'limit' matches for each source variable
top_matches_indices = np.argsort(similarities, axis=1)[:, -limit:][:, ::-1]

# Flatten indices for easier DataFrame construction
flat_indices = top_matches_indices.flatten()
source_repeated = np.repeat(df_source.index, limit)

# Create DataFrame for closest matches
result_df = pd.DataFrame({
'Source Variable': df_source["variable"],
'Target Variable': df_target.iloc[closest_match_indices]["variable"].values,
'Similarity': max_similarities
})
# Create DataFrame for closest matches
result_df = pd.DataFrame({
'Source Variable': df_source.iloc[source_repeated]["variable"].values,
'Target Variable': df_target.iloc[flat_indices]["variable"].values,
'Source Description': df_source.iloc[source_repeated]["description"].values,
'Target Description': df_target.iloc[flat_indices]["description"].values,
'Similarity': np.take_along_axis(similarities, top_matches_indices, axis=1).flatten()
})

return result_df
2 changes: 1 addition & 1 deletion tests/test_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ class TestEmbedding(unittest.TestCase):
"VAR_1", "DESC")

def test_map_dictionary_to_dictionary(self):
df = map_dictionary_to_dictionary(self.data_dictionary_source, self.data_dictionary_source)
df = map_dictionary_to_dictionary(self.data_dictionary_source, self.data_dictionary_source, limit=2)
print(df)
Loading