From 9cdf7f6be900da5a7ce7c8a9a8e6f9a20ed3a049 Mon Sep 17 00:00:00 2001 From: Mehmet Can Ay Date: Wed, 14 Feb 2024 14:52:33 +0100 Subject: [PATCH 1/3] update: MPNet implementation --- .gitignore | 3 +- index/conf.py | 31 +- index/db/__init__.py | 0 index/embedding.py | 32 +- index/evaluation.py | 254 ++++++++++--- index/main.py | 704 ++++++++++++++++++++++++++++-------- index/mapping.py | 99 +++-- index/{db => }/model.py | 9 +- index/parsing.py | 62 +++- index/visualisation.py | 266 +++++++++++--- requirements.txt | 19 +- tests/test_evaluation.py | 143 ++++++-- tests/test_parser.py | 19 +- tests/test_visualisation.py | 94 +++-- 14 files changed, 1371 insertions(+), 364 deletions(-) delete mode 100644 index/db/__init__.py rename index/{db => }/model.py (81%) diff --git a/.gitignore b/.gitignore index ff2ab48..feee596 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,5 @@ cython_debug/ #.idea/ gptstew/.env!/gptstew/resources/ -.idea \ No newline at end of file +.idea +.vscode \ No newline at end of file diff --git a/index/conf.py b/index/conf.py index 824b22e..1b0bb3d 100644 --- a/index/conf.py +++ b/index/conf.py @@ -10,10 +10,29 @@ BIOFIND_DICT_SRC = "resources/dictionaries/pd/biofind.csv" BIOFIND_EMBEDDINGS_SRC = "resources/embeddings/biofind.csv" -COLORS_AD = {'adni': '#d62728', 'aibl': '#ff7f0e', 'emif': '#8c564b', 'jadni': '#7f7f7f', - 'a4': '#aec7e8', 'dod-adni': '#ffbb78', 'prevent-ad': '#98df8a', 'arwibo': '#ff9896', - 'i-adni': '#c5b0d5', 'edsd': '#c49c94', 'pharmacog': '#c7c7c7', - 'vita': '#bcbd22', 'abvib': '#e0d9e2', 'ad-mapper': '#800000'} +COLORS_AD = { + "adni": "#d62728", + "aibl": "#ff7f0e", + "emif": "#8c564b", + "jadni": "#7f7f7f", + "a4": "#aec7e8", + "dod-adni": "#ffbb78", + "prevent-ad": "#98df8a", + "arwibo": "#ff9896", + "i-adni": "#c5b0d5", + "edsd": "#c49c94", + "pharmacog": "#c7c7c7", + "vita": "#bcbd22", + "abvib": "#e0d9e2", + "ad-mapper": "#800000", +} -COLORS_PD = {'opdc': '#1f77b4', 'tpd': '#e377c2', 'biofind': '#9edae5', 'lrrk2': '#f7b6d2', 'luxpark': '#2ca02c', - 'ppmi': '#9467bd', 'passionate': '#00ff00'} +COLORS_PD = { + "opdc": "#1f77b4", + "tpd": "#e377c2", + "biofind": "#9edae5", + "lrrk2": "#f7b6d2", + "luxpark": "#2ca02c", + "ppmi": "#9467bd", + "passionate": "#00ff00", +} diff --git a/index/db/__init__.py b/index/db/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/index/embedding.py b/index/embedding.py index e60bc41..be42cc3 100644 --- a/index/embedding.py +++ b/index/embedding.py @@ -2,10 +2,10 @@ from abc import ABC import numpy as np import openai +from sentence_transformers import SentenceTransformer class EmbeddingModel(ABC): - def get_embedding(self, text: str) -> [float]: pass @@ -14,7 +14,6 @@ def get_embeddings(self, messages: [str]) -> [[float]]: class GPT4Adapter(EmbeddingModel): - def __init__(self, api_key: str): self.api_key = api_key openai.api_key = api_key @@ -28,7 +27,9 @@ def get_embedding(self, text: str, model="text-embedding-ada-002"): return None if isinstance(text, str): text = text.replace("\n", " ") - return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding'] + return openai.Embedding.create(input=[text], model=model)["data"][0][ + "embedding" + ] except Exception as e: logging.error(f"Error getting embedding for {text}: {e}") return None @@ -36,11 +37,32 @@ def get_embedding(self, text: str, model="text-embedding-ada-002"): def get_embeddings(self, messages: [str], model="text-embedding-ada-002"): # store index of nan entries response = 
openai.Embedding.create(input=messages, model=model)
-        return [item['embedding'] for item in response['data']]
+        return [item["embedding"] for item in response["data"]]
 
 
-class TextEmbedding:
+class MPNetAdapter(EmbeddingModel):
+    def __init__(self, model="sentence-transformers/all-mpnet-base-v2"):
+        logging.getLogger().setLevel(logging.INFO)
+        # instantiate the sentence transformer once; constructing it inside
+        # get_embedding would reload the model weights for every description
+        self.mpnet_model = SentenceTransformer(model)
 
+    def get_embedding(self, text: str):
+        logging.info(f"Getting embedding for {text}")
+        try:
+            if text is None or text == "" or text is np.nan:
+                logging.warning("Empty text passed to get_embedding")
+                return None
+            if isinstance(text, str):
+                text = text.replace("\n", " ")
+            return self.mpnet_model.encode(text)
+        except Exception as e:
+            logging.error(f"Error getting embedding for {text}: {e}")
+            return None
+
+    def get_embeddings(self, messages: [str]) -> [[float]]:
+        return [self.get_embedding(msg) for msg in messages]
+
+
+class TextEmbedding:
     def __init__(self, text: str, embedding: [float]):
         self.text = text
         self.embedding = embedding
diff --git a/index/evaluation.py b/index/evaluation.py
index e6a006c..8c9160b 100644
--- a/index/evaluation.py
+++ b/index/evaluation.py
@@ -3,17 +3,24 @@
 from thefuzz import process
 import pandas as pd
 import numpy as np
+from scipy.spatial import distance
+from sklearn.metrics.pairwise import cosine_distances
 
 from index.mapping import MappingTable
 
 
 class MatchingMethod(Enum):
-    EUCLIDEAN_EMBEDDING_DISTANCE = 1,
-    FUZZY_STRING_MATCHING = 2
+    EUCLIDEAN_EMBEDDING_DISTANCE = 1
+    FUZZY_STRING_MATCHING = 2
+    COSINE_EMBEDDING_DISTANCE = 3
 
 
-def enrichment_analysis(source_table: MappingTable, target_table: MappingTable, max_cumulative_match_rank: int = 10,
-                        matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) -> np.ndarray:
+def enrichment_analysis(
+    source_table: MappingTable,
+    target_table: MappingTable,
+    max_cumulative_match_rank: int = 10,
+    matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE,
+) -> np.ndarray:
     """
     Calculate accuracy for the n closest matches between two mapping tables
 
@@ -29,17 +36,26 @@
     # not every variable can be matched
     max_matches = 0
     # clean up source and target table (missing embeddings, descriptions etc.)
- source_table.joined_mapping_table.drop_duplicates(subset=['variable'], keep='first', inplace=True) + source_table.joined_mapping_table.drop_duplicates( + subset=["variable"], keep="first", inplace=True + ) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) - if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: + if ( + matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE + ): source_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) target_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) # re-index to account for dropped rows - target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index(drop=True) + target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index( + drop=True + ) for idx, source_table_row in source_table.joined_mapping_table.iterrows(): correct_target_index = target_table.joined_mapping_table[ - target_table.joined_mapping_table["identifier"] == source_table_row["identifier"]].index + target_table.joined_mapping_table["identifier"] + == source_table_row["identifier"] + ].index if len(correct_target_index) == 0: # can not be matched -> skip continue @@ -51,23 +67,46 @@ def enrichment_analysis(source_table: MappingTable, target_table: MappingTable, if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: source_table_embedding = source_table_row["embedding"] target_table_embedding = target_table_row["embedding"] - distances.append(np.linalg.norm(np.array(source_table_embedding) - np.array(target_table_embedding))) + distances.append( + np.linalg.norm( + np.array(source_table_embedding) + - np.array(target_table_embedding) + ) + ) + elif matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE: + source_table_embedding = np.array(source_table_row["embedding"]) + target_table_embedding = np.array(target_table_row["embedding"]) + distances.append( + distance.cosine(source_table_embedding, target_table_embedding) + ) elif matching_method == MatchingMethod.FUZZY_STRING_MATCHING: source_table_description = source_table_row["description"] target_table_description = target_table_row["description"] - distances.append(100 - fuzz.ratio(source_table_description, target_table_description)) + distances.append( + 100 - fuzz.ratio(source_table_description, target_table_description) + ) else: - raise NotImplementedError("Specified matching method is not implemented!") - min_distance_indices = np.argsort(np.array(distances))[:max_cumulative_match_rank] + raise NotImplementedError( + "Specified matching method is not implemented!" 
+ ) + min_distance_indices = np.argsort(np.array(distances))[ + :max_cumulative_match_rank + ] for n in range(max_cumulative_match_rank): # (due to upper level concepts) there may be more than one correct mapping - if any(element in min_distance_indices[:n+1] for element in correct_target_index): + if any( + element in min_distance_indices[: n + 1] + for element in correct_target_index + ): correct_matches[n] += 1 return (correct_matches / max_matches).round(2) -def match_closest_descriptions(source_table: MappingTable, target_table: MappingTable, - matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) -> pd.DataFrame: +def match_closest_descriptions( + source_table: MappingTable, + target_table: MappingTable, + matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE, +) -> pd.DataFrame: """ Match descriptions from source table to target table based on the biggest similarity @@ -79,45 +118,87 @@ def match_closest_descriptions(source_table: MappingTable, target_table: Mapping """ # sometimes the same concept gets mapped against multiple concepts in CDM, resulting in artifacts in the results # -> drop duplicates, only keep first - source_table.joined_mapping_table.drop_duplicates(subset=['variable'], keep='first', inplace=True) + source_table.joined_mapping_table.drop_duplicates( + subset=["variable"], keep="first", inplace=True + ) # remove rows from source and target that do not contain either a description (in general) or embedding (for gpt) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) - if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: + if ( + matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE + ): source_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) target_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) # method -> compute distance based on embeddings - if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: - if "embedding" not in source_table.joined_mapping_table.columns \ - or "embedding" not in target_table.joined_mapping_table.columns: + if ( + matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE + ): + if ( + "embedding" not in source_table.joined_mapping_table.columns + or "embedding" not in target_table.joined_mapping_table.columns + ): raise ValueError("Mapping tables must contain an 'embedding' column") # re-index to account for dropped rows - target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index(drop=True) + target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index( + drop=True + ) # METHOD: Euclidean Distance based on embeddings if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: - if "embedding" not in source_table.joined_mapping_table.columns \ - or "embedding" not in target_table.joined_mapping_table.columns: + if ( + "embedding" not in source_table.joined_mapping_table.columns + or "embedding" not in target_table.joined_mapping_table.columns + ): + raise ValueError("Mapping tables must contain an 'embedding' column") + source_embeddings = source_table.get_embeddings_numpy() + target_embeddings = target_table.get_embeddings_numpy() + distance_matrix = np.linalg.norm( + source_embeddings[:, np.newaxis] - target_embeddings, axis=-1 + ) + closest_indices = 
np.argmin(distance_matrix, axis=1) + distances = np.min(distance_matrix, axis=1) + matched_target_descriptions = target_table.joined_mapping_table.loc[ + closest_indices, "description" + ].tolist() + # METHOD: Cosine Distance based on embeddings + elif matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE: + if ( + "embedding" not in source_table.joined_mapping_table.columns + or "embedding" not in target_table.joined_mapping_table.columns + ): raise ValueError("Mapping tables must contain an 'embedding' column") source_embeddings = source_table.get_embeddings_numpy() target_embeddings = target_table.get_embeddings_numpy() - distance_matrix = np.linalg.norm(source_embeddings[:, np.newaxis] - target_embeddings, axis=-1) + distance_matrix = cosine_distances(source_embeddings, target_embeddings) closest_indices = np.argmin(distance_matrix, axis=1) distances = np.min(distance_matrix, axis=1) - matched_target_descriptions = target_table.joined_mapping_table.loc[closest_indices, 'description'].tolist() + matched_target_descriptions = target_table.joined_mapping_table.loc[ + closest_indices, "description" + ].tolist() # METHOD: Fuzzy String Matching based on Levenstein Distance elif matching_method == MatchingMethod.FUZZY_STRING_MATCHING: - if "description" not in source_table.joined_mapping_table.columns \ - or "description" not in target_table.joined_mapping_table.columns: + if ( + "description" not in source_table.joined_mapping_table.columns + or "description" not in target_table.joined_mapping_table.columns + ): raise ValueError("Mapping tables must contain an 'description' column") - source_descriptions = source_table.joined_mapping_table["description"].to_numpy() - target_descriptions = target_table.joined_mapping_table["description"].to_numpy() - target_descriptions_dict = {idx: el for idx, el in enumerate(target_descriptions)} + source_descriptions = source_table.joined_mapping_table[ + "description" + ].to_numpy() + target_descriptions = target_table.joined_mapping_table[ + "description" + ].to_numpy() + target_descriptions_dict = { + idx: el for idx, el in enumerate(target_descriptions) + } closest_indices = [] distances = [] matched_target_descriptions = [] for source_description in source_descriptions: - matched_target_description, distance, target_idx = process.extractOne(source_description, - target_descriptions_dict) + matched_target_description, distance, target_idx = process.extractOne( + source_description, target_descriptions_dict + ) closest_indices.append(target_idx) matched_target_descriptions.append(matched_target_description) # it is not a distance but a score [0,100] in this case -> take inverse (+1 to avoid division by 0) @@ -126,26 +207,37 @@ def match_closest_descriptions(source_table: MappingTable, target_table: Mapping else: raise ValueError("Specified Matching method is not implemented!") source_concept_label = source_table.joined_mapping_table["identifier"] - target_concept_label = target_table.joined_mapping_table.loc[closest_indices, 'identifier'].tolist() + target_concept_label = target_table.joined_mapping_table.loc[ + closest_indices, "identifier" + ].tolist() source_variable = source_table.joined_mapping_table["variable"] - target_variable = target_table.joined_mapping_table.loc[closest_indices, 'variable'].tolist() + target_variable = target_table.joined_mapping_table.loc[ + closest_indices, "variable" + ].tolist() correct = source_concept_label == target_concept_label - ground_truth_target_descriptions = 
get_ground_truth_target_descriptions(source_table.joined_mapping_table, - target_table.joined_mapping_table) + ground_truth_target_descriptions = get_ground_truth_target_descriptions( + source_table.joined_mapping_table, target_table.joined_mapping_table + ) source_descriptions = source_table.joined_mapping_table["description"] - result = pd.DataFrame({"correct": correct, - "source_variable": source_variable, - "target_variable": target_variable, - "source_concept_label": source_concept_label, - "target_concept_label": target_concept_label, - "source_description": source_descriptions, - "matched_target_description": matched_target_descriptions, - "ground_truth_target_description": ground_truth_target_descriptions, - "distance": distances}) + result = pd.DataFrame( + { + "correct": correct, + "source_variable": source_variable, + "target_variable": target_variable, + "source_concept_label": source_concept_label, + "target_concept_label": target_concept_label, + "source_description": source_descriptions, + "matched_target_description": matched_target_descriptions, + "ground_truth_target_description": ground_truth_target_descriptions, + "distance": distances, + } + ) return result -def get_ground_truth_target_descriptions(source_table: pd.DataFrame, target_table: pd.DataFrame) -> np.ndarray[str]: +def get_ground_truth_target_descriptions( + source_table: pd.DataFrame, target_table: pd.DataFrame +) -> np.ndarray[str]: """ Get the ground truth target descriptions based on the matched identifiers @@ -157,7 +249,9 @@ def get_ground_truth_target_descriptions(source_table: pd.DataFrame, target_tabl descriptions = [] for source_id in source_table["identifier"]: try: - target_description = target_table.loc[target_table["identifier"] == source_id, "description"].iloc[0] + target_description = target_table.loc[ + target_table["identifier"] == source_id, "description" + ].iloc[0] descriptions.append(target_description) except IndexError: descriptions.append(None) @@ -177,3 +271,69 @@ def score_mappings(matches: pd.DataFrame) -> float: matches = matches[matches["target_concept_label"].notnull()] accuracy = matches["correct"].sum() / len(matches) return accuracy + + +def evaluate( + datasets, + labels, + store_results=False, + model="gpt", + results_root_dir="resources/results/pd", +): + + if model == "gpt": + data_gpt = {} + data_fuzzy = {} + for idx, source in enumerate(datasets): + acc_gpt = [] + acc_fuzzy = [] + for idy, target in enumerate(datasets): + map_gpt = match_closest_descriptions(source, target) + map_fuzzy = match_closest_descriptions( + source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING + ) + if target == "jadni": + print("check") + if store_results: + map_gpt.to_excel( + results_root_dir + + "/gpt_" + + f"{labels[idx]}_to_{labels[idy]}.xlsx" + ) + map_fuzzy.to_excel( + results_root_dir + + "/fuzzy_" + + f"{labels[idx]}_to_{labels[idy]}.xlsx" + ) + acc_gpt.append(round(score_mappings(map_gpt), 2)) + acc_fuzzy.append(round(score_mappings(map_fuzzy), 2)) + data_gpt[labels[idx]] = acc_gpt + data_fuzzy[labels[idx]] = acc_fuzzy + # transpose to have from -> to | row -> column like in the paper + gpt = pd.DataFrame(data_gpt, index=labels).T + fuzzy = pd.DataFrame(data_fuzzy, index=labels).T + return gpt, fuzzy + + elif model == "mpnet": + data_mpnet = {} + for idx, source in enumerate(datasets): + acc_mpnet = [] + for idy, target in enumerate(datasets): + map_mpnet = match_closest_descriptions( + source, + target, + matching_method=MatchingMethod.COSINE_EMBEDDING_DISTANCE, + ) 
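+            # cosine distance is used for the MPNet embeddings: the
+            # all-mpnet-base-v2 model returns length-normalised vectors, so
+            # cosine and Euclidean distance should produce the same ranking here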
+ if target == "jadni": + print("check") + if store_results: + map_mpnet.to_excel( + results_root_dir + + "/mpnet_" + + f"{labels[idx]}_to_{labels[idy]}.xlsx" + ) + acc_mpnet.append(round(score_mappings(map_mpnet), 2)) + data_mpnet[labels[idx]] = acc_mpnet + # transpose to have from -> to | row -> column like in the paper + mpnet = pd.DataFrame(data_mpnet, index=labels).T + return mpnet diff --git a/index/main.py b/index/main.py index 9684daf..08dbe96 100644 --- a/index/main.py +++ b/index/main.py @@ -1,189 +1,605 @@ import os +import sys +sys.path.append("../") import pandas as pd from index import evaluation -from index.conf import PD_CDM_SRC, PPMI_DICT_SRC, LUXPARK_DICT_SRC, BIOFIND_DICT_SRC, AD_CDM_SRC -from index.embedding import GPT4Adapter -from index.evaluation import match_closest_descriptions, MatchingMethod, enrichment_analysis +from index.conf import ( + PD_CDM_SRC, + PPMI_DICT_SRC, + LUXPARK_DICT_SRC, + BIOFIND_DICT_SRC, + AD_CDM_SRC, +) +from index.embedding import GPT4Adapter, MPNetAdapter +from index.evaluation import ( + match_closest_descriptions, + MatchingMethod, + enrichment_analysis, + evaluate, +) from index.mapping import MappingTable from index.parsing import MappingSource, DataDictionarySource from dotenv import load_dotenv -from index.visualisation import scatter_plot_two_distributions, enrichment_plot, scatter_plot_all_cohorts +from index.visualisation import ( + scatter_plot_two_distributions, + enrichment_plot, + scatter_plot_all_cohorts, + bar_chart_average_acc_two_distributions, +) EVAL_PD = True EVAL_AD = True load_dotenv() -gpt4 = GPT4Adapter(api_key=os.getenv('GPT_KEY')) - - -def evaluate(datasets, labels, store_results=False, results_root_dir="resources/results/pd"): - data_gpt = {} - data_fuzzy = {} - for idx, source in enumerate(datasets): - acc_gpt = [] - acc_fuzzy = [] - for idy, target in enumerate(datasets): - map_gpt = match_closest_descriptions(source, target) - map_fuzzy = match_closest_descriptions(source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING) - if target == "jadni": - print("check") - if store_results: - map_gpt.to_excel(results_root_dir + "/gpt_" + f"{labels[idx]}_to_{labels[idy]}.xlsx") - map_fuzzy.to_excel(results_root_dir + "/fuzzy_" + f"{labels[idx]}_to_{labels[idy]}.xlsx") - acc_gpt.append(round(evaluation.score_mappings(map_gpt), 2)) - acc_fuzzy.append(round(evaluation.score_mappings(map_fuzzy), 2)) - data_gpt[labels[idx]] = acc_gpt - data_fuzzy[labels[idx]] = acc_fuzzy - # transpose to have from -> to | row -> column like in the paper - gpt = pd.DataFrame(data_gpt, index=labels).T - fuzzy = pd.DataFrame(data_fuzzy, index=labels).T - return gpt, fuzzy - +gpt4 = GPT4Adapter(api_key=os.getenv("GPT_KEY")) # type: ignore +mpnet = MPNetAdapter() # PD Mappings if EVAL_PD: - cdm_pd = MappingTable(MappingSource(PD_CDM_SRC, "Feature", "CURIE")) - cdm_pd.joined_mapping_table["identifier"].to_csv("resources/cdm_curie.csv", index=False) - cdm_pd.add_descriptions(DataDictionarySource(PD_CDM_SRC, "Feature", "Definition")) - cdm_pd.compute_embeddings(gpt4) - - ppmi = MappingTable(MappingSource(PD_CDM_SRC, "PPMI", "CURIE")) - ppmi.add_descriptions(DataDictionarySource(PPMI_DICT_SRC, "ITM_NAME", "DSCR")) - ppmi.compute_embeddings(gpt4) - - luxpark = MappingTable(MappingSource(PD_CDM_SRC, "LuxPARK", "CURIE")) - luxpark.add_descriptions(DataDictionarySource(LUXPARK_DICT_SRC, "Variable / Field Name", "Field Label")) - luxpark.compute_embeddings(gpt4) - - biofind = MappingTable(MappingSource(PD_CDM_SRC, "BIOFIND", "CURIE")) - 
biofind.add_descriptions(DataDictionarySource(BIOFIND_DICT_SRC, "ITM_NAME", "DSCR")) - biofind.compute_embeddings(gpt4) - - lrrk2 = MappingTable(MappingSource(PD_CDM_SRC, "LRRK2", "CURIE")) - lrrk2.add_descriptions(DataDictionarySource("resources/dictionaries/pd/LRRK2.xlsx", "Variable", "Label")) - lrrk2.compute_embeddings(gpt4) - - opdc = MappingTable(MappingSource(PD_CDM_SRC, "OPDC", "CURIE")) - opdc.add_descriptions( - DataDictionarySource("resources/dictionaries/pd/OPDC.csv", "Variable Name", "Variable description")) - opdc.compute_embeddings(gpt4) - - tpd = MappingTable(MappingSource(PD_CDM_SRC, "TPD", "CURIE")) - tpd.add_descriptions( - DataDictionarySource("resources/dictionaries/pd/TPD.csv", "Variable Name", "Variable description")) - tpd.compute_embeddings(gpt4) - - pd_datesets = [opdc, tpd, biofind, lrrk2, luxpark, ppmi, cdm_pd] - pd_datasets_labels = ["OPDC", "TPD", "Biofind", "LRRK2", "LuxPARK", "PPMI", "PASSIONATE"] + cdm_pd_gpt = MappingTable(MappingSource(PD_CDM_SRC, "Feature", "CURIE")) + cdm_pd_gpt.joined_mapping_table["identifier"].to_csv( + "resources/cdm_curie.csv", index=False + ) + cdm_pd_gpt.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_pd_gpt.compute_embeddings(gpt4) + + cdm_pd_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "Feature", "CURIE")) + cdm_pd_mpnet.joined_mapping_table["identifier"].to_csv( + "resources/cdm_curie.csv", index=False + ) + cdm_pd_mpnet.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_pd_mpnet.compute_embeddings(mpnet) + + ppmi_gpt = MappingTable(MappingSource(PD_CDM_SRC, "PPMI", "CURIE")) + ppmi_gpt.add_descriptions(DataDictionarySource(PPMI_DICT_SRC, "ITM_NAME", "DSCR")) + ppmi_gpt.compute_embeddings(gpt4) + + ppmi_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "PPMI", "CURIE")) + ppmi_mpnet.add_descriptions(DataDictionarySource(PPMI_DICT_SRC, "ITM_NAME", "DSCR")) + ppmi_mpnet.compute_embeddings(mpnet) + + luxpark_gpt = MappingTable(MappingSource(PD_CDM_SRC, "LuxPARK", "CURIE")) + luxpark_gpt.add_descriptions( + DataDictionarySource(LUXPARK_DICT_SRC, "Variable / Field Name", "Field Label") + ) + luxpark_gpt.compute_embeddings(gpt4) + + luxpark_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "LuxPARK", "CURIE")) + luxpark_mpnet.add_descriptions( + DataDictionarySource(LUXPARK_DICT_SRC, "Variable / Field Name", "Field Label") + ) + luxpark_mpnet.compute_embeddings(mpnet) + + biofind_gpt = MappingTable(MappingSource(PD_CDM_SRC, "BIOFIND", "CURIE")) + biofind_gpt.add_descriptions( + DataDictionarySource(BIOFIND_DICT_SRC, "ITM_NAME", "DSCR") + ) + biofind_gpt.compute_embeddings(gpt4) + + biofind_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "BIOFIND", "CURIE")) + biofind_mpnet.add_descriptions( + DataDictionarySource(BIOFIND_DICT_SRC, "ITM_NAME", "DSCR") + ) + biofind_mpnet.compute_embeddings(mpnet) + + lrrk2_gpt = MappingTable(MappingSource(PD_CDM_SRC, "LRRK2", "CURIE")) + lrrk2_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/LRRK2.xlsx", "Variable", "Label" + ) + ) + lrrk2_gpt.compute_embeddings(gpt4) + + lrrk2_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "LRRK2", "CURIE")) + lrrk2_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/LRRK2.xlsx", "Variable", "Label" + ) + ) + lrrk2_mpnet.compute_embeddings(mpnet) + + opdc_gpt = MappingTable(MappingSource(PD_CDM_SRC, "OPDC", "CURIE")) + opdc_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/OPDC.csv", + "Variable Name", + "Variable 
description", + ) + ) + opdc_gpt.compute_embeddings(gpt4) + + opdc_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "OPDC", "CURIE")) + opdc_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/OPDC.csv", + "Variable Name", + "Variable description", + ) + ) + opdc_mpnet.compute_embeddings(mpnet) + + tpd_gpt = MappingTable(MappingSource(PD_CDM_SRC, "TPD", "CURIE")) + tpd_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/TPD.csv", + "Variable Name", + "Variable description", + ) + ) + tpd_gpt.compute_embeddings(gpt4) + + tpd_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "TPD", "CURIE")) + tpd_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/TPD.csv", + "Variable Name", + "Variable description", + ) + ) + tpd_mpnet.compute_embeddings(mpnet) + + pd_datasets_gpt = [ + opdc_gpt, + tpd_gpt, + biofind_gpt, + lrrk2_gpt, + luxpark_gpt, + ppmi_gpt, + cdm_pd_gpt, + ] + pd_datasets_mpnet = [ + opdc_mpnet, + tpd_mpnet, + biofind_mpnet, + lrrk2_mpnet, + luxpark_mpnet, + ppmi_mpnet, + cdm_pd_mpnet, + ] + pd_datasets_labels = [ + "OPDC", + "PRoBaND", + "BIOFIND", + "LCC", + "LuxPARK", + "PPMI", + "PASSIONATE", + ] # enrichment analysis - luxpark_passionate_enrichment_gpt = enrichment_analysis(luxpark, cdm_pd, 20, - MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) - luxpark_passionate_enrichment_fuzzy = enrichment_analysis(luxpark, cdm_pd, 20, MatchingMethod.FUZZY_STRING_MATCHING) + luxpark_passionate_enrichment_gpt = enrichment_analysis( + luxpark_gpt, cdm_pd_gpt, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + ) + luxpark_passionate_enrichment_mpnet = enrichment_analysis( + luxpark_mpnet, cdm_pd_mpnet, 20, MatchingMethod.COSINE_EMBEDDING_DISTANCE + ) + luxpark_passionate_enrichment_fuzzy = enrichment_analysis( + luxpark_gpt, cdm_pd_gpt, 20, MatchingMethod.FUZZY_STRING_MATCHING + ) label1 = "Enrichment Plot LuxPARK to CDM" - ppmi_passionate_enrichment_gpt = enrichment_analysis(ppmi, cdm_pd, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) - ppmi_passionate_enrichment_fuzzy = enrichment_analysis(ppmi, cdm_pd, 20, MatchingMethod.FUZZY_STRING_MATCHING) + ppmi_passionate_enrichment_gpt = enrichment_analysis( + ppmi_gpt, cdm_pd_gpt, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + ) + ppmi_passionate_enrichment_mpnet = enrichment_analysis( + ppmi_mpnet, cdm_pd_mpnet, 20, MatchingMethod.COSINE_EMBEDDING_DISTANCE + ) + ppmi_passionate_enrichment_fuzzy = enrichment_analysis( + ppmi_gpt, cdm_pd_gpt, 20, MatchingMethod.FUZZY_STRING_MATCHING + ) label2 = "Enrichment Plot PPMI to CDM" - enrichment_plot(luxpark_passionate_enrichment_gpt, luxpark_passionate_enrichment_fuzzy, label1, save_plot=True) - enrichment_plot(ppmi_passionate_enrichment_gpt, ppmi_passionate_enrichment_fuzzy, label2, save_plot=True) + ppmi_luxpark_enrichment_gpt = enrichment_analysis( + ppmi_gpt, luxpark_gpt, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + ) + ppmi_luxpark_enrichment_mpnet = enrichment_analysis( + ppmi_mpnet, luxpark_mpnet, 20, MatchingMethod.COSINE_EMBEDDING_DISTANCE + ) + ppmi_luxpark_enrichment_fuzzy = enrichment_analysis( + ppmi_gpt, luxpark_gpt, 20, MatchingMethod.FUZZY_STRING_MATCHING + ) + label3 = "Enrichment Plot PPMI to LuxPARK" + enrichment_plot( + luxpark_passionate_enrichment_gpt, + luxpark_passionate_enrichment_mpnet, + luxpark_passionate_enrichment_fuzzy, + label1, + save_plot=True, + ) + enrichment_plot( + ppmi_passionate_enrichment_gpt, + ppmi_passionate_enrichment_mpnet, + ppmi_passionate_enrichment_fuzzy, + label2, + save_plot=True, + ) 
+ enrichment_plot( + ppmi_luxpark_enrichment_gpt, + ppmi_luxpark_enrichment_mpnet, + ppmi_luxpark_enrichment_fuzzy, + label3, + save_plot=True, + ) print(luxpark_passionate_enrichment_gpt) + print(luxpark_passionate_enrichment_mpnet) print(luxpark_passionate_enrichment_fuzzy) print(ppmi_passionate_enrichment_gpt) + print(ppmi_passionate_enrichment_mpnet) print(ppmi_passionate_enrichment_fuzzy) + print(ppmi_luxpark_enrichment_gpt) + print(ppmi_luxpark_enrichment_mpnet) + print(ppmi_luxpark_enrichment_fuzzy) + + gpt_table1, fuzzy_table1 = evaluate( + pd_datasets_gpt, + pd_datasets_labels, + store_results=True, + model="gpt", + results_root_dir="./resources/results/pd", + ) + + mpnet_table1 = evaluate( + pd_datasets_mpnet, + pd_datasets_labels, + store_results=True, + model="mpnet", + results_root_dir="./resources/results/pd", + ) - gpt_table, fuzzy_table = evaluate(pd_datesets, pd_datasets_labels) print("PD RESULTS:") - print(gpt_table) + print("GPT") + print("-----------") + print(gpt_table1) print("-----------") - print(fuzzy_table) + print("MPNet") + print("-----------") + print(mpnet_table1) + print("-----------") + print("Fuzzy") + print("-----------") + print(fuzzy_table1) print("-----------") # AD Mappings if EVAL_AD: - cdm_ad = cdm_pd = MappingTable(MappingSource(AD_CDM_SRC, "Feature", "CURIE")) - cdm_ad.add_descriptions(DataDictionarySource(PD_CDM_SRC, "Feature", "Definition")) - cdm_ad.compute_embeddings(gpt4) - - a4 = MappingTable(MappingSource(AD_CDM_SRC, "A4", "CURIE")) - a4.add_descriptions(DataDictionarySource("resources/dictionaries/ad/a4.csv", "FLDNAME", "TEXT")) - a4.compute_embeddings(gpt4) - - abvib = MappingTable(MappingSource(AD_CDM_SRC, "ABVIB", "CURIE")) - abvib.add_descriptions(DataDictionarySource("resources/dictionaries/ad/abvib.csv", "variable_name", "description")) - abvib.compute_embeddings(gpt4) - - adni = MappingTable(MappingSource(AD_CDM_SRC, "ADNI", "CURIE")) - adni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/adni.csv", "FLDNAME", "TEXT")) - adni.compute_embeddings(gpt4) - - aibl = MappingTable(MappingSource(AD_CDM_SRC, "AIBL", "CURIE")) - aibl.add_descriptions(DataDictionarySource("resources/dictionaries/ad/aibl.csv", "Name", "Description")) - aibl.compute_embeddings(gpt4) - - arwibo = MappingTable(MappingSource(AD_CDM_SRC, "ARWIBO", "CURIE")) - arwibo.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/arwibo.csv", "Variable_Name", "Element_description")) - arwibo.compute_embeddings(gpt4) - - dod_adni = MappingTable(MappingSource(AD_CDM_SRC, "DOD-ADNI", "CURIE")) + cdm_ad_gpt = cdm_pd_gpt = MappingTable( + MappingSource(AD_CDM_SRC, "Feature", "CURIE") + ) + cdm_ad_gpt.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_ad_gpt.compute_embeddings(gpt4) + + cdm_ad_mpnet = cdm_pd_gpt = MappingTable( + MappingSource(AD_CDM_SRC, "Feature", "CURIE") + ) + cdm_ad_mpnet.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_ad_mpnet.compute_embeddings(mpnet) + + a4_gpt = MappingTable(MappingSource(AD_CDM_SRC, "A4", "CURIE")) + a4_gpt.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/a4.csv", "FLDNAME", "TEXT") + ) + a4_gpt.compute_embeddings(gpt4) + + a4_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "A4", "CURIE")) + a4_mpnet.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/a4.csv", "FLDNAME", "TEXT") + ) + a4_mpnet.compute_embeddings(mpnet) + + abvib_gpt = MappingTable(MappingSource(AD_CDM_SRC, "ABVIB", "CURIE")) + 
abvib_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/abvib.csv", + "variable_name", + "description", + ) + ) + abvib_gpt.compute_embeddings(gpt4) + + abvib_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "ABVIB", "CURIE")) + abvib_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/abvib.csv", + "variable_name", + "description", + ) + ) + abvib_mpnet.compute_embeddings(mpnet) + + adni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "ADNI", "CURIE")) + adni_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/ADNIMERGE_DICT_27Nov2023 2.csv", + "FLDNAME", + "TEXT", + ) + ) + adni_gpt.compute_embeddings(gpt4) + + adni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "ADNI", "CURIE")) + adni_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/ADNIMERGE_DICT_27Nov2023 2.csv", + "FLDNAME", + "TEXT", + ) + ) + adni_mpnet.compute_embeddings(mpnet) + + aibl_gpt = MappingTable(MappingSource(AD_CDM_SRC, "AIBL", "CURIE")) + aibl_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/aibl.csv", "Name", "Description" + ) + ) + aibl_gpt.compute_embeddings(gpt4) + + aibl_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "AIBL", "CURIE")) + aibl_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/aibl.csv", "Name", "Description" + ) + ) + aibl_mpnet.compute_embeddings(mpnet) + + arwibo_gpt = MappingTable(MappingSource(AD_CDM_SRC, "ARWIBO", "CURIE")) + arwibo_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/arwibo.csv", + "Variable_Name", + "Element_description", + ) + ) + arwibo_gpt.compute_embeddings(gpt4) + + arwibo_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "ARWIBO", "CURIE")) + arwibo_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/arwibo.csv", + "Variable_Name", + "Element_description", + ) + ) + arwibo_mpnet.compute_embeddings(mpnet) + + dod_adni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "DOD-ADNI", "CURIE")) # TODO most descriptions missing - dod_adni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/dod-adni.csv", "FLDNAME", "TEXT")) - dod_adni.compute_embeddings(gpt4) - - edsd = MappingTable(MappingSource(AD_CDM_SRC, "EDSD", "CURIE")) - edsd.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/edsd.xlsx", "Variable_Name", "Element_description")) - edsd.compute_embeddings(gpt4) - - emif = MappingTable(MappingSource(AD_CDM_SRC, "EMIF", "CURIE")) - emif.add_descriptions(DataDictionarySource("resources/dictionaries/ad/emif.xlsx", "Variable", "Description")) - emif.compute_embeddings(gpt4) - - i_adni = MappingTable(MappingSource(AD_CDM_SRC, "I-ADNI", "CURIE")) + dod_adni_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/dod-adni.csv", "FLDNAME", "TEXT" + ) + ) + dod_adni_gpt.compute_embeddings(gpt4) + + dod_adni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "DOD-ADNI", "CURIE")) + # TODO most descriptions missing + dod_adni_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/dod-adni.csv", "FLDNAME", "TEXT" + ) + ) + dod_adni_mpnet.compute_embeddings(mpnet) + + edsd_gpt = MappingTable(MappingSource(AD_CDM_SRC, "EDSD", "CURIE")) + edsd_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/edsd.xlsx", + "Variable_Name", + "Element_description", + ) + ) + edsd_gpt.compute_embeddings(gpt4) + + edsd_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "EDSD", "CURIE")) + edsd_mpnet.add_descriptions( + 
DataDictionarySource( + "resources/dictionaries/ad/edsd.xlsx", + "Variable_Name", + "Element_description", + ) + ) + edsd_mpnet.compute_embeddings(mpnet) + + emif_gpt = MappingTable(MappingSource(AD_CDM_SRC, "EMIF", "CURIE")) + emif_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/emif.xlsx", "Variable", "Description" + ) + ) + emif_gpt.compute_embeddings(gpt4) + + emif_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "EMIF", "CURIE")) + emif_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/emif.xlsx", "Variable", "Description" + ) + ) + emif_mpnet.compute_embeddings(mpnet) + + i_adni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "I-ADNI", "CURIE")) # TODO about half of descriptions missing - i_adni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/i-adni.csv", "acronym", "variable")) - i_adni.compute_embeddings(gpt4) - - jadni = MappingTable(MappingSource(AD_CDM_SRC, "JADNI", "CURIE")) - jadni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/jadni.tsv", "FLDNAME", "TEXT")) - jadni.compute_embeddings(gpt4) - - pharmacog = MappingTable(MappingSource(AD_CDM_SRC, "PharmaCog", "CURIE")) - pharmacog.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/pharmacog.csv", "Variable_Name", "Element_description")) - pharmacog.compute_embeddings(gpt4) - - prevent_ad = MappingTable(MappingSource(AD_CDM_SRC, "PREVENT-AD", "CURIE")) - prevent_ad.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/prevent-ad.csv", "variable", "description")) - prevent_ad.compute_embeddings(gpt4) - - vita = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) - vita.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/vita.csv", "Variable_Name", "Element_description")) - vita.compute_embeddings(gpt4) + i_adni_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/i-adni.csv", "acronym", "variable" + ) + ) + i_adni_gpt.compute_embeddings(gpt4) + + i_adni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "I-ADNI", "CURIE")) + # TODO about half of descriptions missing + i_adni_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/i-adni.csv", "acronym", "variable" + ) + ) + i_adni_mpnet.compute_embeddings(mpnet) + + jadni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "JADNI", "CURIE")) + jadni_gpt.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/jadni.tsv", "FLDNAME", "TEXT") + ) + jadni_gpt.compute_embeddings(gpt4) + + jadni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "JADNI", "CURIE")) + jadni_mpnet.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/jadni.tsv", "FLDNAME", "TEXT") + ) + jadni_mpnet.compute_embeddings(mpnet) + + pharmacog_gpt = MappingTable(MappingSource(AD_CDM_SRC, "PharmaCog", "CURIE")) + pharmacog_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/pharmacog.csv", + "Variable_Name", + "Element_description", + ) + ) + pharmacog_gpt.compute_embeddings(gpt4) + + pharmacog_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "PharmaCog", "CURIE")) + pharmacog_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/pharmacog.csv", + "Variable_Name", + "Element_description", + ) + ) + pharmacog_mpnet.compute_embeddings(mpnet) + + prevent_ad_gpt = MappingTable(MappingSource(AD_CDM_SRC, "PREVENT-AD", "CURIE")) + prevent_ad_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/prevent-ad.csv", + "variable", + "description", + ) + ) + 
prevent_ad_gpt.compute_embeddings(gpt4) + + prevent_ad_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "PREVENT-AD", "CURIE")) + prevent_ad_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/prevent-ad.csv", + "variable", + "description", + ) + ) + prevent_ad_mpnet.compute_embeddings(mpnet) + + vita_gpt = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) + vita_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/vita.csv", + "Variable_Name", + "Element_description", + ) + ) + vita_gpt.compute_embeddings(gpt4) + + vita_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) + vita_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/vita.csv", + "Variable_Name", + "Element_description", + ) + ) + vita_mpnet.compute_embeddings(mpnet) wmh_ad = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) - ad_datasets = [a4, abvib, adni, aibl, arwibo, dod_adni, edsd, emif, i_adni, jadni, pharmacog, prevent_ad, vita, - cdm_ad] - ad_datasets_labels = ["A4", "Abvib", "ADNI", "AIBL", "ARWIBO", "DOD-ADNI", "EDSD", "EMIF", "I-ADNI", "JADNI", - "PharmaCog", "PREVENT-AD", "VITA", "AD-Mapper"] - gpt_table, fuzzy_table = evaluate(ad_datasets, ad_datasets_labels) + ad_datasets_gpt = [ + a4_gpt, + abvib_gpt, + adni_gpt, + aibl_gpt, + arwibo_gpt, + dod_adni_gpt, + edsd_gpt, + emif_gpt, + i_adni_gpt, + jadni_gpt, + pharmacog_gpt, + prevent_ad_gpt, + vita_gpt, + cdm_ad_gpt, + ] + + ad_datasets_mpnet = [ + a4_mpnet, + abvib_mpnet, + adni_mpnet, + aibl_mpnet, + arwibo_mpnet, + dod_adni_mpnet, + edsd_mpnet, + emif_mpnet, + i_adni_mpnet, + jadni_mpnet, + pharmacog_mpnet, + prevent_ad_mpnet, + vita_mpnet, + cdm_ad_mpnet, + ] + ad_datasets_labels = [ + "A4", + "ABVIB", + "ADNI", + "AIBL", + "ARWIBO", + "DOD-ADNI", + "EDSD", + "EMIF", + "I-ADNI", + "JADNI", + "PharmaCog", + "PREVENT-AD", + "VITA", + "AD-Mapper", + ] + gpt_table2, fuzzy_table2 = evaluate( + ad_datasets_gpt, + ad_datasets_labels, + store_results=True, + model="gpt", + results_root_dir="resources/results/ad", + ) + + mpnet_table2 = evaluate( + ad_datasets_mpnet, + ad_datasets_labels, + store_results=True, + model="mpnet", + results_root_dir="resources/results/ad", + ) print("AD RESULTS:") - print(gpt_table.to_string()) + print("GPT") + print("-----------") + print(gpt_table2.to_string()) + print("-----------") + print("MPNet") print("-----------") - print(fuzzy_table.to_string()) + print(mpnet_table2.to_string()) + print("-----------") + print("Fuzzy") + print("-----------") + print(fuzzy_table2.to_string()) print("-----------") # embedding distribution -scatter_plot_two_distributions(pd_datesets, ad_datasets, "PD", "AD") -scatter_plot_all_cohorts(pd_datesets, ad_datasets, pd_datasets_labels, ad_datasets_labels) - - +scatter_plot_two_distributions(pd_datasets_gpt, ad_datasets_gpt, "PD", "AD") +scatter_plot_all_cohorts( + pd_datasets_gpt, ad_datasets_gpt, pd_datasets_labels, ad_datasets_labels +) diff --git a/index/mapping.py b/index/mapping.py index da3ba62..5f35013 100644 --- a/index/mapping.py +++ b/index/mapping.py @@ -2,16 +2,19 @@ import numpy as np from index.embedding import EmbeddingModel -from index.db.model import Terminology, Mapping, Concept, Variable +from index.model import Terminology, Mapping, Concept, Variable from index.parsing import MappingSource, DataDictionarySource, EmbeddingSource class MappingTable: - def __init__(self, mapping_source: MappingSource, - data_dictionary_source: DataDictionarySource = None, - embedding_source: EmbeddingSource = 
None, - terminology: Terminology = None): + def __init__( + self, + mapping_source: MappingSource, + data_dictionary_source: DataDictionarySource = None, + embedding_source: EmbeddingSource = None, + terminology: Terminology = None, + ): self.mapping_source: MappingSource = mapping_source self.data_dictionary_source: DataDictionarySource = data_dictionary_source self.embedding_source: EmbeddingSource = embedding_source @@ -35,17 +38,23 @@ def add_descriptions(self, data_dictionary_source: DataDictionarySource): self.data_dictionary_source = data_dictionary_source data_dictionary_df = data_dictionary_source.to_dataframe() # FIXME: Join results in duplicate entries - self.joined_mapping_table = pd.merge(self.joined_mapping_table, data_dictionary_df, - left_on="variable", - right_on="variable", - how="left").drop_duplicates() + self.joined_mapping_table = pd.merge( + self.joined_mapping_table, + data_dictionary_df, + left_on="variable", + right_on="variable", + how="left", + ).drop_duplicates() def add_embeddings(self, embedding_source: EmbeddingSource): self.embedding_source = embedding_source # FIXME: Join results in duplicate entries - self.joined_mapping_table = pd.merge(self.joined_mapping_table, embedding_source.to_dataframe(), - left_on='description', - right_on="description") + self.joined_mapping_table = pd.merge( + self.joined_mapping_table, + embedding_source.to_dataframe(), + left_on="description", + right_on="description", + ) def get_embeddings(self): if "embedding" not in self.joined_mapping_table.columns: @@ -53,32 +62,46 @@ def get_embeddings(self): if "description" not in self.joined_mapping_table.columns: raise ValueError("No descriptions found in mapping table.") else: - return self.joined_mapping_table['embedding'].apply(np.array) + return self.joined_mapping_table["embedding"].apply(np.array) def get_embeddings_numpy(self): - return np.array(self.joined_mapping_table['embedding'].dropna().tolist()) + return np.array(self.joined_mapping_table["embedding"].dropna().tolist()) def save_embeddings(self, output_path: str): self.get_embeddings().to_csv(output_path, index=False) self.embedding_source = EmbeddingSource(output_path) def compute_embeddings(self, model: EmbeddingModel): - descriptions = self.joined_mapping_table['description'].dropna().unique().tolist() + descriptions = ( + self.joined_mapping_table["description"].dropna().unique().tolist() + ) embeddings = model.get_embeddings(descriptions) - embedding_df = pd.DataFrame({'description': descriptions, 'embedding': embeddings}) - self.joined_mapping_table = pd.merge(self.joined_mapping_table, embedding_df, - left_on='description', - right_on='description', - how='left') + embedding_df = pd.DataFrame( + {"description": descriptions, "embedding": embeddings} + ) + self.joined_mapping_table = pd.merge( + self.joined_mapping_table, + embedding_df, + left_on="description", + right_on="description", + how="left", + ) def export_embeddings(self, output_path: str): - descriptions = self.joined_mapping_table['description'].dropna().unique().tolist() - embedding_df = pd.DataFrame({'description': descriptions, 'embedding': self.joined_mapping_table['embedding']}) + descriptions = ( + self.joined_mapping_table["description"].dropna().unique().tolist() + ) + embedding_df = pd.DataFrame( + { + "description": descriptions, + "embedding": self.joined_mapping_table["embedding"], + } + ) embedding_df.to_csv(output_path) def import_embeddings(self, input_path: str): embeddings = pd.read_csv(input_path) - 
self.joined_mapping_table['embedding'] = embeddings['embedding'] + self.joined_mapping_table["embedding"] = embeddings["embedding"] def get_mapping_table(self) -> pd.DataFrame: return self.joined_mapping_table @@ -89,14 +112,20 @@ def get_mappings(self) -> [Mapping]: concept_id = row["identifier"] variable_name = row["variable"] if self.data_dictionary_source is not None: - description = row['description'] + description = row["description"] else: description = None if not pd.isna(concept_id) and not pd.isna(variable_name): concept = Concept(concept_id, self.terminology) - variable = Variable(variable_name, description, - self.data_dictionary_source.file_path - if self.data_dictionary_source is not None else None) + variable = Variable( + variable_name, + description, + ( + self.data_dictionary_source.file_path + if self.data_dictionary_source is not None + else None + ), + ) mapping = Mapping(concept, variable, self.mapping_source.file_path) mappings.append(mapping) # remove duplicates @@ -108,14 +137,20 @@ def to_mapping_dto(self) -> [Mapping]: concept_id = row["identifier"] variable_name = row["variable"] if self.data_dictionary_source is not None: - description = row['description'] + description = row["description"] else: description = None if not pd.isna(concept_id) and not pd.isna(variable_name): concept = Concept(concept_id, self.terminology) - variable = Variable(variable_name, description, - self.data_dictionary_source.file_path - if self.data_dictionary_source is not None else None) + variable = Variable( + variable_name, + description, + ( + self.data_dictionary_source.file_path + if self.data_dictionary_source is not None + else None + ), + ) mapping = Mapping(concept, variable, self.mapping_source.file_path) mappings.append(mapping) # remove duplicates @@ -123,4 +158,4 @@ def to_mapping_dto(self) -> [Mapping]: def parse_float_array(s): - return [float(x) for x in s.strip('[]').split(',')] \ No newline at end of file + return [float(x) for x in s.strip("[]").split(",")] diff --git a/index/db/model.py b/index/model.py similarity index 81% rename from index/db/model.py rename to index/model.py index ab1608c..c30eca5 100644 --- a/index/db/model.py +++ b/index/model.py @@ -27,7 +27,9 @@ def to_dataframe(self): class Variable: - def __init__(self, name: str, description: str, source: str, embedding: Embedding = None): + def __init__( + self, name: str, description: str, source: str, embedding: Embedding = None + ): self.name = name self.description = description self.source = source @@ -42,7 +44,10 @@ def __init__(self, concept: Concept, variable: Variable, source: str): self.source = source def __eq__(self, other): - return self.concept.identifier == other.concept.identifier and self.variable.name == other.variable.name + return ( + self.concept.identifier == other.concept.identifier + and self.variable.name == other.variable.name + ) def __hash__(self): return hash((self.concept.identifier, self.variable.name)) diff --git a/index/parsing.py b/index/parsing.py index b63fbd7..1a25c39 100644 --- a/index/parsing.py +++ b/index/parsing.py @@ -5,35 +5,41 @@ class Source(ABC): - def __int__(self, file_path: str): self.file_path = file_path def to_dataframe(self) -> pd.DataFrame: # TODO: hardcoded for ad resources -> remove later if self.file_path.endswith("pharmacog.csv"): - return pd.read_csv(self.file_path, sep=' ') + return pd.read_csv(self.file_path, sep=" ") elif self.file_path.endswith("arwibo.csv"): - return pd.read_csv(self.file_path, sep=';', usecols=range(6), 
encoding='ISO-8859-1')
+            return pd.read_csv(
+                self.file_path, sep=";", usecols=range(6), encoding="ISO-8859-1"
+            )
         elif self.file_path.endswith("jadni.tsv"):
-            return pd.read_csv(self.file_path, sep='\t', encoding='ISO-8859-1')
+            return pd.read_csv(self.file_path, sep="\t", encoding="ISO-8859-1")
         elif self.file_path.endswith("vita.csv"):
-            return pd.read_csv(self.file_path, sep=',', encoding_errors='ignore')
+            return pd.read_csv(self.file_path, sep=",", encoding_errors="ignore")
         elif self.file_path.endswith("wmh-ad.csv"):
-            pd.read_csv(self.file_path, sep=',', encoding_errors='ignore')
+            return pd.read_csv(self.file_path, sep=",", encoding_errors="ignore")
         elif self.file_path.endswith(".csv"):
             return pd.read_csv(self.file_path)
         # back to general encodings
         elif self.file_path.endswith(".tsv"):
-            return pd.read_csv(self.file_path, sep='\t')
+            return pd.read_csv(self.file_path, sep="\t")
         elif self.file_path.endswith(".xlsx"):
             xls = pd.ExcelFile(self.file_path)
-            dfs = [pd.read_excel(xls, sheet_name=sheet_name) for sheet_name in xls.sheet_names]
+            dfs = [
+                pd.read_excel(xls, sheet_name=sheet_name)
+                for sheet_name in xls.sheet_names
+            ]
             for df in dfs:
                 # Replace control sequences in string columns / headers & remove trailing whitespaces
-                df.columns = df.columns.str.replace('\r', '', regex=True).str.strip()
-                string_columns = df.select_dtypes(include=['object']).columns
-                df[string_columns] = df[string_columns].apply(lambda x: x.str.replace('\r', '').str.strip(), axis=1)
+                df.columns = df.columns.str.replace("\r", "", regex=True).str.strip()
+                string_columns = df.select_dtypes(include=["object"]).columns
+                df[string_columns] = df[string_columns].apply(
+                    lambda x: x.str.replace("\r", "").str.strip(), axis=1
+                )
             combined_df = pd.concat(dfs, ignore_index=True)
             return combined_df
         else:
@@ -54,11 +60,20 @@
     def to_dataframe(self) -> pd.DataFrame:
         df = super().to_dataframe()
         # sanity check
         if self.variable_field not in df.columns:
-            raise ValueError(f"Variable field {self.variable_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Variable field {self.variable_field} not found in {self.file_path}"
+            )
         if self.identifier_field not in df.columns:
-            raise ValueError(f"Identifier field {self.identifier_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Identifier field {self.identifier_field} not found in {self.file_path}"
+            )
         df = df[[self.variable_field, self.identifier_field]]
-        df = df.rename(columns={self.variable_field: "variable", self.identifier_field: "identifier"})
+        df = df.rename(
+            columns={
+                self.variable_field: "variable",
+                self.identifier_field: "identifier",
+            }
+        )
         df.dropna(subset=["variable", "identifier"], inplace=True)
         return df
 
@@ -77,17 +92,25 @@
     def to_dataframe(self) -> pd.DataFrame:
         df = super().to_dataframe()
         # sanity check
         if self.variable_field not in df.columns:
-            raise ValueError(f"Variable field {self.variable_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Variable field {self.variable_field} not found in {self.file_path}"
+            )
         if self.description_field not in df.columns:
-            raise ValueError(f"Description field {self.description_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Description field {self.description_field} not found in {self.file_path}"
+            )
         df = df[[self.variable_field, self.description_field]]
-        df = df.rename(columns={self.variable_field: "variable", self.description_field: "description"})
+        df = df.rename(
+            columns={
+                self.variable_field: "variable",
+                self.description_field: "description",
+            }
+        )
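+        # variables without a description cannot be embedded or fuzzy-matched
+        # downstream, so such rows are dropped together with unnamed variables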
df.dropna(subset=["variable", "description"], inplace=True) return df class EmbeddingSource: - def __init__(self, source_path: str): self.source_path = source_path self.description_field = "description" @@ -106,11 +129,12 @@ def export(self, dst_path: str): def parse_float_array(s): - return [float(x) for x in s.strip('[]').split(',')] + return [float(x) for x in s.strip("[]").split(",")] class ConceptSource: """ identifier -> description """ + pass diff --git a/index/visualisation.py b/index/visualisation.py index e641c15..f88d9fe 100644 --- a/index/visualisation.py +++ b/index/visualisation.py @@ -7,14 +7,15 @@ import matplotlib.pyplot as plt from sklearn.manifold import TSNE import plotly.graph_objects as go +import plotly.express as px from index.conf import COLORS_AD, COLORS_PD from index.mapping import MappingTable class PlotSide(Enum): - LEFT = 1, - RIGHT = 2, + LEFT = (1,) + RIGHT = (2,) BOTH = 3 @@ -30,25 +31,45 @@ def get_cohort_specific_color_code(cohort_name: str): elif cohort_name.lower() in COLORS_PD: return COLORS_PD[cohort_name.lower()] else: - print(f'No color code found for cohort {cohort_name}') + print(f"No color code found for cohort {cohort_name}") return None -def enrichment_plot(acc_gpt, acc_fuzzy, title, save_plot=False, save_dir="resources/results/plots"): - if len(acc_gpt) != len(acc_fuzzy): - raise ValueError("acc_gpt and acc_fuzzy should be of the same length!") - data = {"Maximum Considered Rank": list(range(1, len(acc_gpt) + 1)), "GPT": acc_gpt, - "Fuzzy": acc_fuzzy} +def enrichment_plot( + acc_gpt, + acc_mpnet, + acc_fuzzy, + title, + save_plot=False, + save_dir="resources/results/plots", +): + if ( + len(acc_gpt) != len(acc_fuzzy) + or len(acc_gpt) != len(acc_mpnet) + or len(acc_mpnet) != len(acc_fuzzy) + ): + raise ValueError( + "acc_gpt, acc_mpnet and acc_fuzzy should be of the same length!" 
+ ) + data = { + "Maximum Considered Rank": list(range(1, len(acc_gpt) + 1)), + "GPT": acc_gpt, + "MPNet": acc_mpnet, + "Fuzzy": acc_fuzzy, + } df = pd.DataFrame(data) sns.set(style="whitegrid") sns.lineplot(data=df, x="Maximum Considered Rank", y="GPT", label="GPT") - sns.lineplot(data=df, x="Maximum Considered Rank", y="Fuzzy", label="Fuzzy String Matching") + sns.lineplot(data=df, x="Maximum Considered Rank", y="MPNet", label="MPNet") + sns.lineplot( + data=df, x="Maximum Considered Rank", y="Fuzzy", label="Fuzzy String Matching" + ) sns.set(style="whitegrid") plt.xlabel("Maximum Considered Rank") plt.ylabel("Accuracy") plt.xticks(range(1, len(acc_gpt) + 1), labels=range(1, len(acc_gpt) + 1)) plt.yticks([i / 10 for i in range(11)]) - plt.gca().set_yticklabels([f'{i:.1f}' for i in plt.gca().get_yticks()]) + plt.gca().set_yticklabels([f"{i:.1f}" for i in plt.gca().get_yticks()]) plt.title(title) plt.legend() if save_plot: @@ -61,71 +82,222 @@ def concat_embeddings(tables1: [MappingTable], tables2: [MappingTable]): tables1_cleaned = [copy.deepcopy(table) for table in tables1] tables2_cleaned = [copy.deepcopy(table) for table in tables2] for table1, table2 in zip(tables1_cleaned, tables2_cleaned): - table1.joined_mapping_table.dropna(subset=['embedding', 'description'], inplace=True) - table2.joined_mapping_table.dropna(subset=['embedding', 'description'], inplace=True) - vectors_tables1 = np.concatenate([table.get_embeddings_numpy() for table in tables1_cleaned]) - vectors_tables2 = np.concatenate([table.get_embeddings_numpy() for table in tables2_cleaned]) - descriptions_table1 = np.concatenate([table.joined_mapping_table["description"] for table in tables1_cleaned]) - descriptions_table2 = np.concatenate([table.joined_mapping_table["description"] for table in tables2_cleaned]) - boundaries1 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables1_cleaned]) - boundaries2 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables2_cleaned]) + table1.joined_mapping_table.dropna( + subset=["embedding", "description"], inplace=True + ) + table2.joined_mapping_table.dropna( + subset=["embedding", "description"], inplace=True + ) + vectors_tables1 = np.concatenate( + [table.get_embeddings_numpy() for table in tables1_cleaned] + ) + vectors_tables2 = np.concatenate( + [table.get_embeddings_numpy() for table in tables2_cleaned] + ) + descriptions_table1 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables1_cleaned] + ) + descriptions_table2 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables2_cleaned] + ) + boundaries1 = np.array( + [ + table.joined_mapping_table["embedding"].index.size + for table in tables1_cleaned + ] + ) + boundaries2 = np.array( + [ + table.joined_mapping_table["embedding"].index.size + for table in tables2_cleaned + ] + ) vectors_concatenated = np.concatenate([vectors_tables1, vectors_tables2]) - descriptions_concatenated = np.concatenate([descriptions_table1, descriptions_table2]) - boundaries_concatenated = size_array_to_boundaries(np.concatenate([boundaries1, boundaries2])) + descriptions_concatenated = np.concatenate( + [descriptions_table1, descriptions_table2] + ) + boundaries_concatenated = size_array_to_boundaries( + np.concatenate([boundaries1, boundaries2]) + ) return vectors_concatenated, descriptions_concatenated, boundaries_concatenated -def scatter_plot_two_distributions(tables1: [MappingTable], tables2: [MappingTable], label1: str, label2: str, - 
store_html: bool = True,
-                                    store_destination: str = "resources/results/plots/ad_vs_pd.html"):
-    vectors_tables1 = np.concatenate([table.get_embeddings_numpy() for table in tables1])
-    vectors_tables2 = np.concatenate([table.get_embeddings_numpy() for table in tables2])
+def bar_chart_average_acc_two_distributions(
+    dist1_fuzzy: pd.DataFrame,
+    dist1_gpt: pd.DataFrame,
+    dist1_mpnet: pd.DataFrame,
+    dist2_fuzzy: pd.DataFrame,
+    dist2_gpt: pd.DataFrame,
+    dist2_mpnet: pd.DataFrame,
+    title: str,
+    label1: str,
+    label2: str,
+):
+    if not all(
+        gpt.shape == fuzzy.shape == mpnet.shape
+        for gpt, mpnet, fuzzy in [
+            (dist1_gpt, dist1_mpnet, dist1_fuzzy),
+            (dist2_gpt, dist2_mpnet, dist2_fuzzy),
+        ]
+    ):
+        raise ValueError(
+            "The fuzzy, GPT and MPNet DataFrames of each distribution must have the same dimensions"
+        )
+    if not all(dist.shape[0] == dist.shape[1] for dist in [dist1_fuzzy, dist2_fuzzy]):
+        raise ValueError("Each accuracy DataFrame must be square")
+    if not all(
+        fuzzy.index.equals(gpt.index) and fuzzy.columns.equals(gpt.columns)
+        for fuzzy, gpt in [(dist1_fuzzy, dist1_gpt), (dist2_fuzzy, dist2_gpt)]
+    ):
+        raise ValueError(
+            "All row and column labels within each pair of fuzzy and GPT DataFrames must be equal"
+        )
+    # average value without the diagonal, since the diagonal contains matches of a cohort against itself
+    avg_acc_fuzzy1 = np.mean(
+        dist1_fuzzy.values[~np.eye(dist1_fuzzy.shape[0], dtype=bool)]
+    )
+    avg_acc_fuzzy2 = np.mean(
+        dist2_fuzzy.values[~np.eye(dist2_fuzzy.shape[0], dtype=bool)]
+    )
+    avg_acc_gpt1 = np.mean(dist1_gpt.values[~np.eye(dist1_gpt.shape[0], dtype=bool)])
+    avg_acc_gpt2 = np.mean(dist2_gpt.values[~np.eye(dist2_gpt.shape[0], dtype=bool)])
+    avg_acc_mpnet1 = np.mean(
+        dist1_mpnet.values[~np.eye(dist1_mpnet.shape[0], dtype=bool)]
+    )
+    avg_acc_mpnet2 = np.mean(
+        dist2_mpnet.values[~np.eye(dist2_mpnet.shape[0], dtype=bool)]
+    )
+    data = {
+        "Fuzzy String Matching": [avg_acc_fuzzy1, avg_acc_fuzzy2],
+        "GPT Embeddings": [avg_acc_gpt1, avg_acc_gpt2],
+        "MPNet Embeddings": [avg_acc_mpnet1, avg_acc_mpnet2],
+    }
+    df = pd.DataFrame(data, index=[label1, label2])
+    # df: rows = the two distributions, columns = the three matching methods
+    df_melted = df.reset_index().melt(
+        id_vars="index", var_name="Method", value_name="Accuracy"
+    )
+    plt.figure(figsize=(10, 6))
+    sns.set(style="whitegrid")
+    sns.barplot(x="index", y="Accuracy", hue="Method", data=df_melted)
+    plt.xlabel("")
+    plt.ylabel("Average Accuracy")
+    plt.title(title)
+    plt.show()
+
+
+def scatter_plot_two_distributions(
+    tables1: [MappingTable],
+    tables2: [MappingTable],
+    label1: str,
+    label2: str,
+    store_html: bool = True,
+    legend_font_size: int = 16,
+    store_destination: str = "resources/results/plots/ad_vs_pd.html",
+):
+    vectors_tables1 = np.concatenate(
+        [table.get_embeddings_numpy() for table in tables1]
+    )
+    vectors_tables2 = np.concatenate(
+        [table.get_embeddings_numpy() for table in tables2]
+    )
     # remove entries that do not contain an embedding -> have no corresponding vector
-    [table.joined_mapping_table.dropna(subset=['embedding'], inplace=True) for table in tables1]
-    [table.joined_mapping_table.dropna(subset=['embedding'], inplace=True) for table in tables2]
+    [
+        table.joined_mapping_table.dropna(subset=["embedding"], inplace=True)
+        for table in tables1
+    ]
+    [
+        table.joined_mapping_table.dropna(subset=["embedding"], inplace=True)
+        for table in tables2
+    ]
     # get descriptions as interactive labels
-    labels_table1 = np.concatenate([table.joined_mapping_table["description"] for table in tables1])
-    labels_table2 =
np.concatenate([table.joined_mapping_table["description"] for table in tables2]) + labels_table1 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables1] + ) + labels_table2 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables2] + ) # boundary for concatenated vector class_boundary = len(vectors_tables1) vectors_concatenated = np.concatenate([vectors_tables1, vectors_tables2]) tsne = TSNE(n_components=2, perplexity=30, random_state=42) tsne_result = tsne.fit_transform(vectors_concatenated) fig = go.Figure() - fig.add_trace(go.Scatter(x=tsne_result[:class_boundary, 0], y=tsne_result[:class_boundary, 1], - mode="markers", name=label1, text=labels_table1)) - fig.add_trace(go.Scatter(x=tsne_result[class_boundary:, 0], y=tsne_result[class_boundary:, 1], - mode="markers", name=label2, text=labels_table2)) + # bigger legend size + fig.update_layout(legend=dict(font=dict(size=legend_font_size))) + fig.add_trace( + go.Scatter( + x=tsne_result[:class_boundary, 0], + y=tsne_result[:class_boundary, 1], + mode="markers", + name=label1, + text=labels_table1, + ) + ) + fig.add_trace( + go.Scatter( + x=tsne_result[class_boundary:, 0], + y=tsne_result[class_boundary:, 1], + mode="markers", + name=label2, + text=labels_table2, + ) + ) fig.show() if store_html: fig.write_html(store_destination) -def scatter_plot_all_cohorts(tables1: [MappingTable], tables2: [MappingTable], labels1: [str], labels2: [str], - plot_side: PlotSide = PlotSide.BOTH, store_html: bool = True, - store_base_dir: str = "resources/results/plots"): +def scatter_plot_all_cohorts( + tables1: [MappingTable], + tables2: [MappingTable], + labels1: [str], + labels2: [str], + plot_side: PlotSide = PlotSide.BOTH, + store_html: bool = True, + legend_font_size: int = 16, + store_base_dir: str = "resources/results/plots", +): if not len(tables1) == len(labels1) or not len(tables2) == len(labels2): raise ValueError("Length of corresponding tables and labels must be equal!") tables_boundary = len(tables1) vectors, descriptions, boundaries = concat_embeddings(tables1, tables2) - tsne = TSNE(n_components=2, perplexity=(30 if len(vectors) > 30 else len(vectors) - 1), random_state=42) + tsne = TSNE( + n_components=2, + perplexity=(30 if len(vectors) > 30 else len(vectors) - 1), + random_state=42, + ) tsne_result = tsne.fit_transform(vectors) + # more distinct colors + color_scale = px.colors.qualitative.Set3 fig = go.Figure() + # bigger legend size + fig.update_layout(legend=dict(font=dict(size=legend_font_size))) # first cohort is from 0 to x boundaries = np.insert(boundaries, 0, 0) for idx in range(len(tables1)): if labels1[idx]: - fig.add_trace(go.Scatter(x=tsne_result[boundaries[idx]:boundaries[idx + 1], 0], - y=tsne_result[boundaries[idx]:boundaries[idx + 1], 1], - mode="markers", name=labels1[idx], - text=descriptions[boundaries[idx]:boundaries[idx + 1]], - line=dict(color=get_cohort_specific_color_code(labels1[idx])))) + fig.add_trace( + go.Scatter( + x=tsne_result[boundaries[idx] : boundaries[idx + 1], 0], + y=tsne_result[boundaries[idx] : boundaries[idx + 1], 1], + mode="markers", + name=labels1[idx], + text=descriptions[boundaries[idx] : boundaries[idx + 1]], + # line=dict(color=get_cohort_specific_color_code(labels1[idx])) + ) + ) for idy in range(len(tables1), len(boundaries) - 1): - fig.add_trace(go.Scatter(x=tsne_result[boundaries[idy]:boundaries[idy + 1], 0], - y=tsne_result[boundaries[idy]:boundaries[idy + 1], 1], - mode="markers", name=labels2[idy - len(tables1)], - 
text=descriptions[boundaries[idy]:boundaries[idy + 1]], - line=dict(color=get_cohort_specific_color_code(labels2[idy - len(tables1)])))) + fig.add_trace( + go.Scatter( + x=tsne_result[boundaries[idy] : boundaries[idy + 1], 0], + y=tsne_result[boundaries[idy] : boundaries[idy + 1], 1], + mode="markers", + name=labels2[idy - len(tables1)], + text=descriptions[boundaries[idy] : boundaries[idy + 1]], + # line=dict(color=get_cohort_specific_color_code(labels2[idy - len(tables1)])) + ) + ) if store_html: fig.write_html(store_base_dir + "/tsne_all_cohorts.html") fig.show() diff --git a/requirements.txt b/requirements.txt index 7b21b4c..0610d8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,18 @@ +matplotlib~=3.8.1 numpy==1.25.2 +openai~=0.28.0 +openpyxl pandas==2.1.0 pip==21.3.1 +plotly~=5.17.0 python-dateutil==2.8.2 +python-dotenv~=1.0.0 pytz==2023.3 +seaborn~=0.13.0 +sentence-transformers==2.3.1 setuptools==60.2.0 +scikit-learn==1.3.2 six==1.16.0 -tzdata==2023.3 -wheel==0.37.1 -openpyxl -openai~=0.28.0 -scikit-learn~=1.3.0 -plotly~=5.17.0 -python-dotenv~=1.0.0 thefuzz~=0.20.0 -matplotlib~=3.8.1 -seaborn~=0.13.0 \ No newline at end of file +tzdata==2023.3 +wheel==0.37.1 \ No newline at end of file diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 143dbf4..1f0a03e 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -3,7 +3,12 @@ import numpy as np -from index.evaluation import match_closest_descriptions, MatchingMethod, enrichment_analysis, score_mappings +from index.evaluation import ( + match_closest_descriptions, + MatchingMethod, + enrichment_analysis, + score_mappings, +) from index.mapping import MappingTable from index.parsing import MappingSource, DataDictionarySource @@ -12,19 +17,46 @@ class Test(TestCase): TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) - mapping_source = MappingSource(os.path.join(TEST_DIR_PATH, "resources", 'test_mapping.xlsx'), "VAR_1", "ID_1") - data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'), - "VAR_1", "DESC") + mapping_source = MappingSource( + os.path.join(TEST_DIR_PATH, "resources", "test_mapping.xlsx"), "VAR_1", "ID_1" + ) + data_dictionary_source = DataDictionarySource( + os.path.join(TEST_DIR_PATH, "resources", "test_data_dict.csv"), "VAR_1", "DESC" + ) def test_match_closest_descriptions_embeddings(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - embeddings1 = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], np.nan, np.nan, [8, 8], [9, 9], np.nan] - embeddings2 = [[0, 0], np.nan, [9, 9], [3, 3], [7, 7], [5.1, 5.1], [5, 5], [4, 4], np.nan] - mapping_table1.joined_mapping_table['embedding'] = embeddings1 - mapping_table2.joined_mapping_table['embedding'] = embeddings2 + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + embeddings1 = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 5], + np.nan, + np.nan, + [8, 8], + [9, 9], + np.nan, + ] + embeddings2 = [ + [0, 0], + np.nan, + [9, 9], + [3, 3], + [7, 7], + [5.1, 5.1], + [5, 5], + [4, 4], + np.nan, + ] + mapping_table1.joined_mapping_table["embedding"] = embeddings1 + mapping_table2.joined_mapping_table["embedding"] = 
embeddings2 result = match_closest_descriptions(mapping_table1, mapping_table2) self.assertEqual(3, result["correct"].sum()) @@ -32,34 +64,87 @@ def test_score_mappings(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - embeddings1 = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], np.nan, np.nan, [8, 8], [9, 9], np.nan] - embeddings2 = [[0, 0], np.nan, [9, 9], [3, 3], [7, 7], [5.1, 5.1], [5, 5], [4, 4], np.nan] - mapping_table1.joined_mapping_table['embedding'] = embeddings1 - mapping_table2.joined_mapping_table['embedding'] = embeddings2 + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + embeddings1 = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 5], + np.nan, + np.nan, + [8, 8], + [9, 9], + np.nan, + ] + embeddings2 = [ + [0, 0], + np.nan, + [9, 9], + [3, 3], + [7, 7], + [5.1, 5.1], + [5, 5], + [4, 4], + np.nan, + ] + mapping_table1.joined_mapping_table["embedding"] = embeddings1 + mapping_table2.joined_mapping_table["embedding"] = embeddings2 # 2 should be correct out of a total of 4 valid mappings (possible matches, no nan) result = match_closest_descriptions(mapping_table1, mapping_table2) acc = score_mappings(result) - self.assertEqual(3/5, acc) + self.assertEqual(3 / 5, acc) def test_match_closest_description_fuzzy(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - result = match_closest_descriptions(mapping_table1, mapping_table2, - matching_method=MatchingMethod.FUZZY_STRING_MATCHING) + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + result = match_closest_descriptions( + mapping_table1, + mapping_table2, + matching_method=MatchingMethod.FUZZY_STRING_MATCHING, + ) self.assertEqual(7, result["correct"].sum()) def test_enrichment_analysis_embeddings(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - embeddings1 = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], np.nan, np.nan, [8, 8], [9, 9], np.nan] - embeddings2 = [[0, 0], np.nan, [9, 9], [3, 3], [7, 7], [5.1, 5.1], [5, 5], [4, 4], np.nan] - mapping_table1.joined_mapping_table['embedding'] = embeddings1 - mapping_table2.joined_mapping_table['embedding'] = embeddings2 + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + embeddings1 = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 5], + np.nan, + np.nan, + [8, 8], + [9, 9], + np.nan, + ] + embeddings2 = [ + [0, 0], + np.nan, + [9, 9], + [3, 3], + [7, 7], + [5.1, 5.1], + [5, 5], + [4, 4], + np.nan, + ] + mapping_table1.joined_mapping_table["embedding"] = embeddings1 + mapping_table2.joined_mapping_table["embedding"] = 
embeddings2 result = enrichment_analysis(mapping_table1, mapping_table2, 5) self.assertListEqual([3 / 5, 3 / 5, 4 / 5, 4 / 5, 1], result.tolist()) @@ -67,7 +152,13 @@ def test_enrichment_analysis_fuzzy(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - result = enrichment_analysis(mapping_table1, mapping_table2, 5, - matching_method=MatchingMethod.FUZZY_STRING_MATCHING) + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + result = enrichment_analysis( + mapping_table1, + mapping_table2, + 5, + matching_method=MatchingMethod.FUZZY_STRING_MATCHING, + ) self.assertListEqual([1, 1, 1, 1, 1], result.tolist()) diff --git a/tests/test_parser.py b/tests/test_parser.py index 14555a1..417e850 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -8,9 +8,12 @@ class Test(TestCase): TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) - mapping_source = MappingSource(os.path.join(TEST_DIR_PATH, "resources", 'test_mapping.xlsx'), "VAR_1", "ID_1") - data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'), - "VAR_1", "DESC") + mapping_source = MappingSource( + os.path.join(TEST_DIR_PATH, "resources", "test_mapping.xlsx"), "VAR_1", "ID_1" + ) + data_dictionary_source = DataDictionarySource( + os.path.join(TEST_DIR_PATH, "resources", "test_data_dict.csv"), "VAR_1", "DESC" + ) def test_parse(self): mapping_table = MappingTable(self.mapping_source, self.data_dictionary_source) @@ -34,8 +37,14 @@ def test_parse_add_description_later(self): def test_parse_data_dict_excel(self): mapping_table = MappingTable(self.mapping_source) data_dictionary_source = DataDictionarySource( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources", 'test_data_dict.xlsx'), - "VAR_1", "DESC") + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "resources", + "test_data_dict.xlsx", + ), + "VAR_1", + "DESC", + ) mapping_table.add_descriptions(data_dictionary_source) mappings = mapping_table.get_mappings() self.assertEqual(11, len(mappings)) diff --git a/tests/test_visualisation.py b/tests/test_visualisation.py index 0c74b8c..b3bd61b 100644 --- a/tests/test_visualisation.py +++ b/tests/test_visualisation.py @@ -2,18 +2,28 @@ from unittest import TestCase import numpy as np +import pandas as pd +from index.evaluation import evaluate from index.mapping import MappingTable from index.parsing import MappingSource, DataDictionarySource -from index.visualisation import scatter_plot_two_distributions, enrichment_plot, scatter_plot_all_cohorts +from index.visualisation import ( + scatter_plot_two_distributions, + enrichment_plot, + scatter_plot_all_cohorts, + bar_chart_average_acc_two_distributions, +) class Test(TestCase): TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) - mapping_source = MappingSource(os.path.join(TEST_DIR_PATH, "resources", 'test_mapping.xlsx'), "VAR_1", "ID_1") - data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'), - "VAR_1", "DESC") + mapping_source = MappingSource( + os.path.join(TEST_DIR_PATH, "resources", "test_mapping.xlsx"), "VAR_1", "ID_1" + ) + data_dictionary_source = DataDictionarySource( + 
os.path.join(TEST_DIR_PATH, "resources", "test_data_dict.csv"), "VAR_1", "DESC" + ) embeddings1 = [ [1.1, 2.2, 3.3], @@ -26,7 +36,7 @@ class Test(TestCase): [22.4, 23.5, 24.6], [25.7, 26.8, 27.9], [28.1, 29.2, 30.3], - [31.4, 32.5, 33.6] + [31.4, 32.5, 33.6], ] embeddings2 = [ @@ -40,7 +50,7 @@ class Test(TestCase): [23.4, 24.5, 25.6], [26.7, 27.8, 28.9], np.nan, - [32.4, 33.5, 34.6] + [32.4, 33.5, 34.6], ] embeddings3 = [ @@ -54,7 +64,7 @@ class Test(TestCase): [24.4, 25.5, 26.6], [27.7, 28.8, 29.9], [30.1, 31.2, 32.3], - [33.4, 34.5, 35.6] + [33.4, 34.5, 35.6], ] embeddings4 = [ @@ -68,7 +78,7 @@ class Test(TestCase): [25.4, 26.5, 27.6], [28.7, 29.8, 30.9], [31.1, 32.2, 33.3], - np.nan + np.nan, ] def test_scatter_plot_two_distributions(self): @@ -81,12 +91,17 @@ def test_scatter_plot_two_distributions(self): mapping_table4 = MappingTable(self.mapping_source) mapping_table4.add_descriptions(self.data_dictionary_source) - mapping_table1.joined_mapping_table['embedding'] = self.embeddings1 - mapping_table2.joined_mapping_table['embedding'] = self.embeddings2 - mapping_table3.joined_mapping_table['embedding'] = self.embeddings3 - mapping_table4.joined_mapping_table['embedding'] = self.embeddings4 - scatter_plot_two_distributions([mapping_table1, mapping_table2], [mapping_table3, mapping_table4], "A", "B", - store_html=False) + mapping_table1.joined_mapping_table["embedding"] = self.embeddings1 + mapping_table2.joined_mapping_table["embedding"] = self.embeddings2 + mapping_table3.joined_mapping_table["embedding"] = self.embeddings3 + mapping_table4.joined_mapping_table["embedding"] = self.embeddings4 + scatter_plot_two_distributions( + [mapping_table1, mapping_table2], + [mapping_table3, mapping_table4], + "A", + "B", + store_html=False, + ) def test_scatter_plot_all_cohorts(self): mapping_table1 = MappingTable(self.mapping_source) @@ -98,15 +113,52 @@ def test_scatter_plot_all_cohorts(self): mapping_table4 = MappingTable(self.mapping_source) mapping_table4.add_descriptions(self.data_dictionary_source) - mapping_table1.joined_mapping_table['embedding'] = self.embeddings1 - mapping_table2.joined_mapping_table['embedding'] = self.embeddings2 - mapping_table3.joined_mapping_table['embedding'] = self.embeddings3 - mapping_table4.joined_mapping_table['embedding'] = self.embeddings4 - scatter_plot_all_cohorts([mapping_table1, mapping_table2], [mapping_table3, mapping_table4], - ["A1", "A2"], ["B1", "B2"], store_html=False) + mapping_table1.joined_mapping_table["embedding"] = self.embeddings1 + mapping_table2.joined_mapping_table["embedding"] = self.embeddings2 + mapping_table3.joined_mapping_table["embedding"] = self.embeddings3 + mapping_table4.joined_mapping_table["embedding"] = self.embeddings4 + scatter_plot_all_cohorts( + [mapping_table1, mapping_table2], + [mapping_table3, mapping_table4], + ["A1", "A2"], + ["B1", "B2"], + store_html=False, + ) def test_enrichment_plot(self): acc_gpt = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0] + acc_mpnet = [0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0] acc_fuzzy = [0.2, 0.3, 0.4, 0.5, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0] title = "Test" - enrichment_plot(acc_gpt, acc_fuzzy, title, save_plot=False) + enrichment_plot(acc_gpt, acc_mpnet, acc_fuzzy, title, save_plot=False) + + def test_bar_chart_average_acc_two_distributions(self): + labels = ["M1", "M2", "M3"] + fuzzy_1 = pd.DataFrame( + {"M1": [1, 0.2, 0.23], "M2": [0.3, 1, 0.16], "M3": [0.27, 0.22, 1]}, + index=labels, + ).T + fuzzy_2 = pd.DataFrame( + {"M1": [1, 0.19, 0.21], "M2": [0.29, 1, 0.18], 
"M3": [0.29, 0.21, 1]}, + index=labels, + ).T + gpt_1 = pd.DataFrame( + {"M1": [1, 0.9, 0.78], "M2": [0.8, 1, 0.78], "M3": [0.82, 0.89, 1]}, + index=labels, + ).T + gpt_2 = pd.DataFrame( + {"M1": [1, 0.88, 0.78], "M2": [0.79, 1, 0.78], "M3": [0.81, 0.85, 1]}, + index=labels, + ).T + mpnet_1 = pd.DataFrame( + {"M1": [1, 0.8, 0.7], "M2": [0.7, 0.9, 0.68], "M3": [0.72, 0.79, 0.9]}, + index=labels, + ).T + mpnet_2 = pd.DataFrame( + {"M1": [0.9, 0.78, 0.68], "M2": [0.69, 0.9, 0.68], "M3": [0.71, 0.75, 0.9]}, + index=labels, + ).T + + bar_chart_average_acc_two_distributions( + fuzzy_1, gpt_1, mpnet_1, fuzzy_2, gpt_2, mpnet_2, "title", "AD", "PD" + ) From 041c4caccb6e7bb13b714cb51f9c2c9ea06075db Mon Sep 17 00:00:00 2001 From: Mehmet Can Ay Date: Wed, 14 Feb 2024 16:44:12 +0100 Subject: [PATCH 2/3] fix: formatting --- index/embedding.py | 4 +- index/evaluation.py | 126 ++++++++++------------------------------- index/main.py | 20 ++++++- index/mapping.py | 2 +- index/model.py | 56 ------------------ index/visualisation.py | 56 +++++------------- 6 files changed, 63 insertions(+), 201 deletions(-) delete mode 100644 index/model.py diff --git a/index/embedding.py b/index/embedding.py index be42cc3..c7b4639 100644 --- a/index/embedding.py +++ b/index/embedding.py @@ -27,9 +27,7 @@ def get_embedding(self, text: str, model="text-embedding-ada-002"): return None if isinstance(text, str): text = text.replace("\n", " ") - return openai.Embedding.create(input=[text], model=model)["data"][0][ - "embedding" - ] + return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"] except Exception as e: logging.error(f"Error getting embedding for {text}: {e}") return None diff --git a/index/evaluation.py b/index/evaluation.py index 8c9160b..b5c6925 100644 --- a/index/evaluation.py +++ b/index/evaluation.py @@ -36,26 +36,16 @@ def enrichment_analysis( # not every variable can be matched max_matches = 0 # clean up source and target table (missing embeddings, descriptions etc.) 
- source_table.joined_mapping_table.drop_duplicates( - subset=["variable"], keep="first", inplace=True - ) + source_table.joined_mapping_table.drop_duplicates(subset=["variable"], keep="first", inplace=True) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) - if ( - matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE - or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE - ): + if (matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE): source_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) target_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) # re-index to account for dropped rows - target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index( - drop=True - ) + target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index(drop=True) for idx, source_table_row in source_table.joined_mapping_table.iterrows(): - correct_target_index = target_table.joined_mapping_table[ - target_table.joined_mapping_table["identifier"] - == source_table_row["identifier"] - ].index + correct_target_index = target_table.joined_mapping_table[target_table.joined_mapping_table["identifier"] == source_table_row["identifier"]].index if len(correct_target_index) == 0: # can not be matched -> skip continue @@ -67,37 +57,21 @@ def enrichment_analysis( if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: source_table_embedding = source_table_row["embedding"] target_table_embedding = target_table_row["embedding"] - distances.append( - np.linalg.norm( - np.array(source_table_embedding) - - np.array(target_table_embedding) - ) - ) + distances.append(np.linalg.norm(np.array(source_table_embedding) - np.array(target_table_embedding))) elif matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE: source_table_embedding = np.array(source_table_row["embedding"]) target_table_embedding = np.array(target_table_row["embedding"]) - distances.append( - distance.cosine(source_table_embedding, target_table_embedding) - ) + distances.append(distance.cosine(source_table_embedding, target_table_embedding)) elif matching_method == MatchingMethod.FUZZY_STRING_MATCHING: source_table_description = source_table_row["description"] target_table_description = target_table_row["description"] - distances.append( - 100 - fuzz.ratio(source_table_description, target_table_description) - ) + distances.append(100 - fuzz.ratio(source_table_description, target_table_description)) else: - raise NotImplementedError( - "Specified matching method is not implemented!" 
- ) - min_distance_indices = np.argsort(np.array(distances))[ - :max_cumulative_match_rank - ] + raise NotImplementedError("Specified matching method is not implemented!") + min_distance_indices = np.argsort(np.array(distances))[:max_cumulative_match_rank] for n in range(max_cumulative_match_rank): # (due to upper level concepts) there may be more than one correct mapping - if any( - element in min_distance_indices[: n + 1] - for element in correct_target_index - ): + if any(element in min_distance_indices[: n + 1] for element in correct_target_index): correct_matches[n] += 1 return (correct_matches / max_matches).round(2) @@ -118,9 +92,7 @@ def match_closest_descriptions( """ # sometimes the same concept gets mapped against multiple concepts in CDM, resulting in artifacts in the results # -> drop duplicates, only keep first - source_table.joined_mapping_table.drop_duplicates( - subset=["variable"], keep="first", inplace=True - ) + source_table.joined_mapping_table.drop_duplicates(subset=["variable"], keep="first", inplace=True) # remove rows from source and target that do not contain either a description (in general) or embedding (for gpt) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) @@ -280,60 +252,22 @@ def evaluate( model="gpt", results_root_dir="resources/results/pd", ): - - if model == "gpt": - data_gpt = {} - data_fuzzy = {} - for idx, source in enumerate(datasets): - acc_gpt = [] - acc_fuzzy = [] - for idy, target in enumerate(datasets): - map_gpt = match_closest_descriptions(source, target) - map_fuzzy = match_closest_descriptions( - source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING - ) - if target == "jadni": - print("check") - if store_results: - map_gpt.to_excel( - results_root_dir - + "/gpt_" - + f"{labels[idx]}_to_{labels[idy]}.xlsx" - ) - map_fuzzy.to_excel( - results_root_dir - + "/fuzzy_" - + f"{labels[idx]}_to_{labels[idy]}.xlsx" - ) - acc_gpt.append(round(score_mappings(map_gpt), 2)) - acc_fuzzy.append(round(score_mappings(map_fuzzy), 2)) - data_gpt[labels[idx]] = acc_gpt - data_fuzzy[labels[idx]] = acc_fuzzy - # transpose to have from -> to | row -> column like in the paper - gpt = pd.DataFrame(data_gpt, index=labels).T - fuzzy = pd.DataFrame(data_fuzzy, index=labels).T - return gpt, fuzzy - - elif model == "mpnet": - data_mpnet = {} - for idx, source in enumerate(datasets): - acc_mpnet = [] - for idy, target in enumerate(datasets): - map_mpnet = match_closest_descriptions( - source, - target, - matching_method=MatchingMethod.COSINE_EMBEDDING_DISTANCE, - ) - if target == "jadni": - print("check") - if store_results: - map_mpnet.to_excel( - results_root_dir - + "/mpnet_" - + f"{labels[idx]}_to_{labels[idy]}.xlsx" - ) - acc_mpnet.append(round(score_mappings(map_mpnet), 2)) - data_mpnet[labels[idx]] = acc_mpnet - # transpose to have from -> to | row -> column like in the paper - mpnet = pd.DataFrame(data_mpnet, index=labels).T - return mpnet + data = {} + for idx, source in enumerate(datasets): + acc = [] + for idy, target in enumerate(datasets): + if model == "gpt": + map = match_closest_descriptions(source, target) + elif model == "mpnet": + map = match_closest_descriptions(source,target, matching_method=MatchingMethod.COSINE_EMBEDDING_DISTANCE) + elif model == "fuzzy": + map = match_closest_descriptions(source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING) + else: + raise NotImplementedError("Specified model is not 
implemented!") + if store_results: + map.to_excel(results_root_dir + f"/{model}_" + f"{labels[idx]}_to_{labels[idy]}.xlsx") + acc.append(round(score_mappings(map), 2)) + data[labels[idx]] = acc + # transpose to have from -> to | row -> column like in the paper + model_output = pd.DataFrame(data, index=labels).T + return model_output diff --git a/index/main.py b/index/main.py index 08dbe96..d36eb54 100644 --- a/index/main.py +++ b/index/main.py @@ -236,7 +236,7 @@ print(ppmi_luxpark_enrichment_mpnet) print(ppmi_luxpark_enrichment_fuzzy) - gpt_table1, fuzzy_table1 = evaluate( + gpt_table1 = evaluate( pd_datasets_gpt, pd_datasets_labels, store_results=True, @@ -244,6 +244,14 @@ results_root_dir="./resources/results/pd", ) + fuzzy_table1 = evaluate( + pd_datasets_gpt, + pd_datasets_labels, + store_results=True, + model="fuzzy", + results_root_dir="./resources/results/pd", + ) + mpnet_table1 = evaluate( pd_datasets_mpnet, pd_datasets_labels, @@ -568,7 +576,7 @@ "VITA", "AD-Mapper", ] - gpt_table2, fuzzy_table2 = evaluate( + gpt_table2 = evaluate( ad_datasets_gpt, ad_datasets_labels, store_results=True, @@ -576,6 +584,14 @@ results_root_dir="resources/results/ad", ) + fuzzy_table2 = evaluate( + ad_datasets_gpt, + ad_datasets_labels, + store_results=True, + model="fuzzy", + results_root_dir="resources/results/ad", + ) + mpnet_table2 = evaluate( ad_datasets_mpnet, ad_datasets_labels, diff --git a/index/mapping.py b/index/mapping.py index 5f35013..307ed2d 100644 --- a/index/mapping.py +++ b/index/mapping.py @@ -2,7 +2,7 @@ import numpy as np from index.embedding import EmbeddingModel -from index.model import Terminology, Mapping, Concept, Variable +from index.db.model import Terminology, Mapping, Concept, Variable from index.parsing import MappingSource, DataDictionarySource, EmbeddingSource diff --git a/index/model.py b/index/model.py deleted file mode 100644 index c30eca5..0000000 --- a/index/model.py +++ /dev/null @@ -1,56 +0,0 @@ -import pandas as pd - - -class Terminology: - - def __int__(self, identifier: str, name: str): - self.identifier = identifier - self.name = name - - -class Concept: - - def __init__(self, identifier: str, terminology: Terminology): - self.identifier = identifier - self.terminology = terminology - - -class Embedding: - - def __init__(self, embedding: [float], source: str): - self.embedding = embedding - self.source = source - - def to_dataframe(self): - return pd.DataFrame(self.embedding, columns=[self.source]) - - -class Variable: - - def __init__( - self, name: str, description: str, source: str, embedding: Embedding = None - ): - self.name = name - self.description = description - self.source = source - self.embedding = embedding - - -class Mapping: - - def __init__(self, concept: Concept, variable: Variable, source: str): - self.concept = concept - self.variable = variable - self.source = source - - def __eq__(self, other): - return ( - self.concept.identifier == other.concept.identifier - and self.variable.name == other.variable.name - ) - - def __hash__(self): - return hash((self.concept.identifier, self.variable.name)) - - def __str__(self): - return f"{self.variable.name} ({self.variable.description}) -> {self.concept.identifier}" diff --git a/index/visualisation.py b/index/visualisation.py index f88d9fe..b9ea0ec 100644 --- a/index/visualisation.py +++ b/index/visualisation.py @@ -14,8 +14,8 @@ class PlotSide(Enum): - LEFT = (1,) - RIGHT = (2,) + LEFT = 1, + RIGHT = 2, BOTH = 3 @@ -43,11 +43,7 @@ def enrichment_plot( save_plot=False, 
save_dir="resources/results/plots",
 ):
-    if (
-        len(acc_gpt) != len(acc_fuzzy)
-        or len(acc_gpt) != len(acc_mpnet)
-        or len(acc_mpnet) != len(acc_fuzzy)
-    ):
+    if not (len(acc_gpt) == len(acc_fuzzy) == len(acc_mpnet)):
         raise ValueError(
             "acc_gpt, acc_mpnet and acc_fuzzy should be of the same length!"
         )
@@ -82,43 +78,17 @@ def concat_embeddings(tables1: [MappingTable], tables2: [MappingTable]):
     tables1_cleaned = [copy.deepcopy(table) for table in tables1]
     tables2_cleaned = [copy.deepcopy(table) for table in tables2]
     for table1, table2 in zip(tables1_cleaned, tables2_cleaned):
-        table1.joined_mapping_table.dropna(
-            subset=["embedding", "description"], inplace=True
-        )
-        table2.joined_mapping_table.dropna(
-            subset=["embedding", "description"], inplace=True
-        )
-    vectors_tables1 = np.concatenate(
-        [table.get_embeddings_numpy() for table in tables1_cleaned]
-    )
-    vectors_tables2 = np.concatenate(
-        [table.get_embeddings_numpy() for table in tables2_cleaned]
-    )
-    descriptions_table1 = np.concatenate(
-        [table.joined_mapping_table["description"] for table in tables1_cleaned]
-    )
-    descriptions_table2 = np.concatenate(
-        [table.joined_mapping_table["description"] for table in tables2_cleaned]
-    )
-    boundaries1 = np.array(
-        [
-            table.joined_mapping_table["embedding"].index.size
-            for table in tables1_cleaned
-        ]
-    )
-    boundaries2 = np.array(
-        [
-            table.joined_mapping_table["embedding"].index.size
-            for table in tables2_cleaned
-        ]
-    )
+        table1.joined_mapping_table.dropna(subset=["embedding", "description"], inplace=True)
+        table2.joined_mapping_table.dropna(subset=["embedding", "description"], inplace=True)
+    vectors_tables1 = np.concatenate([table.get_embeddings_numpy() for table in tables1_cleaned])
+    vectors_tables2 = np.concatenate([table.get_embeddings_numpy() for table in tables2_cleaned])
+    descriptions_table1 = np.concatenate([table.joined_mapping_table["description"] for table in tables1_cleaned])
+    descriptions_table2 = np.concatenate([table.joined_mapping_table["description"] for table in tables2_cleaned])
+    boundaries1 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables1_cleaned])
+    boundaries2 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables2_cleaned])
     vectors_concatenated = np.concatenate([vectors_tables1, vectors_tables2])
-    descriptions_concatenated = np.concatenate(
-        [descriptions_table1, descriptions_table2]
-    )
-    boundaries_concatenated = size_array_to_boundaries(
-        np.concatenate([boundaries1, boundaries2])
-    )
+    descriptions_concatenated = np.concatenate([descriptions_table1, descriptions_table2])
+    boundaries_concatenated = size_array_to_boundaries(np.concatenate([boundaries1, boundaries2]))
     return vectors_concatenated, descriptions_concatenated, boundaries_concatenated

From 3456fead3bb177def9cb58b7a9236e4157292842 Mon Sep 17 00:00:00 2001
From: Mehmet Can Ay
Date: Wed, 14 Feb 2024 16:44:27 +0100
Subject: [PATCH 3/3] add: model

---
 index/db/__init__.py |  0
 index/db/model.py    | 56 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 index/db/__init__.py
 create mode 100644 index/db/model.py

diff --git a/index/db/__init__.py b/index/db/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/index/db/model.py b/index/db/model.py
new file mode 100644
index 0000000..c30eca5
--- /dev/null
+++ b/index/db/model.py
@@ -0,0 +1,56 @@
+import pandas as pd
+
+
+class Terminology:
+
+    def __init__(self, identifier: str, name: str):
+        self.identifier = identifier
+        self.name = name
+
+
+class Concept:
+
+    def __init__(self, identifier: str, terminology: Terminology):
+        self.identifier = identifier
+        self.terminology = terminology
+
+
+class Embedding:
+
+    def __init__(self, embedding: [float], source: str):
+        self.embedding = embedding
+        self.source = source
+
+    def to_dataframe(self):
+        return pd.DataFrame(self.embedding, columns=[self.source])
+
+
+class Variable:
+
+    def __init__(
+        self, name: str, description: str, source: str, embedding: Embedding = None
+    ):
+        self.name = name
+        self.description = description
+        self.source = source
+        self.embedding = embedding
+
+
+class Mapping:
+
+    def __init__(self, concept: Concept, variable: Variable, source: str):
+        self.concept = concept
+        self.variable = variable
+        self.source = source
+
+    def __eq__(self, other):
+        return (
+            self.concept.identifier == other.concept.identifier
+            and self.variable.name == other.variable.name
+        )
+
+    def __hash__(self):
+        return hash((self.concept.identifier, self.variable.name))
+
+    def __str__(self):
+        return f"{self.variable.name} ({self.variable.description}) -> {self.concept.identifier}"
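
For context, a short sketch of how the data model re-added under index/db/ is meant to be wired together; every identifier, name, and description below is invented for illustration, and the import assumes the package root is on PYTHONPATH.

# Hypothetical usage of index/db/model.py; all concrete values are made up.
from index.db.model import Terminology, Concept, Variable, Mapping

terminology = Terminology("snomed", "SNOMED CT")
concept = Concept("C0011849", terminology)
variable = Variable("DIABETES", "Diabetes mellitus diagnosis", source="adni")
mapping_a = Mapping(concept, variable, source="adni")
mapping_b = Mapping(concept, variable, source="jadni")

# Equality and hashing consider only the concept identifier and variable name,
# so the same variable -> concept pair deduplicates across mapping sources:
assert mapping_a == mapping_b
assert len({mapping_a, mapping_b}) == 1
print(mapping_a)  # DIABETES (Diabetes mellitus diagnosis) -> C0011849

Keying equality on the (concept identifier, variable name) pair is what lets duplicate mappings harvested from different cohort catalogues collapse into a single entry when collected into a set.
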