From 9cdf7f6be900da5a7ce7c8a9a8e6f9a20ed3a049 Mon Sep 17 00:00:00 2001 From: Mehmet Can Ay Date: Wed, 14 Feb 2024 14:52:33 +0100 Subject: [PATCH 1/3] update: MPNet implementation --- .gitignore | 3 +- index/conf.py | 31 +- index/db/__init__.py | 0 index/embedding.py | 32 +- index/evaluation.py | 254 ++++++++++--- index/main.py | 704 ++++++++++++++++++++++++++++-------- index/mapping.py | 99 +++-- index/{db => }/model.py | 9 +- index/parsing.py | 62 +++- index/visualisation.py | 266 +++++++++++--- requirements.txt | 19 +- tests/test_evaluation.py | 143 ++++++-- tests/test_parser.py | 19 +- tests/test_visualisation.py | 94 +++-- 14 files changed, 1371 insertions(+), 364 deletions(-) delete mode 100644 index/db/__init__.py rename index/{db => }/model.py (81%) diff --git a/.gitignore b/.gitignore index ff2ab48..feee596 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,5 @@ cython_debug/ #.idea/ gptstew/.env!/gptstew/resources/ -.idea \ No newline at end of file +.idea +.vscode \ No newline at end of file diff --git a/index/conf.py b/index/conf.py index 824b22e..1b0bb3d 100644 --- a/index/conf.py +++ b/index/conf.py @@ -10,10 +10,29 @@ BIOFIND_DICT_SRC = "resources/dictionaries/pd/biofind.csv" BIOFIND_EMBEDDINGS_SRC = "resources/embeddings/biofind.csv" -COLORS_AD = {'adni': '#d62728', 'aibl': '#ff7f0e', 'emif': '#8c564b', 'jadni': '#7f7f7f', - 'a4': '#aec7e8', 'dod-adni': '#ffbb78', 'prevent-ad': '#98df8a', 'arwibo': '#ff9896', - 'i-adni': '#c5b0d5', 'edsd': '#c49c94', 'pharmacog': '#c7c7c7', - 'vita': '#bcbd22', 'abvib': '#e0d9e2', 'ad-mapper': '#800000'} +COLORS_AD = { + "adni": "#d62728", + "aibl": "#ff7f0e", + "emif": "#8c564b", + "jadni": "#7f7f7f", + "a4": "#aec7e8", + "dod-adni": "#ffbb78", + "prevent-ad": "#98df8a", + "arwibo": "#ff9896", + "i-adni": "#c5b0d5", + "edsd": "#c49c94", + "pharmacog": "#c7c7c7", + "vita": "#bcbd22", + "abvib": "#e0d9e2", + "ad-mapper": "#800000", +} -COLORS_PD = {'opdc': '#1f77b4', 'tpd': '#e377c2', 'biofind': '#9edae5', 'lrrk2': '#f7b6d2', 'luxpark': '#2ca02c', - 'ppmi': '#9467bd', 'passionate': '#00ff00'} +COLORS_PD = { + "opdc": "#1f77b4", + "tpd": "#e377c2", + "biofind": "#9edae5", + "lrrk2": "#f7b6d2", + "luxpark": "#2ca02c", + "ppmi": "#9467bd", + "passionate": "#00ff00", +} diff --git a/index/db/__init__.py b/index/db/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/index/embedding.py b/index/embedding.py index e60bc41..be42cc3 100644 --- a/index/embedding.py +++ b/index/embedding.py @@ -2,10 +2,10 @@ from abc import ABC import numpy as np import openai +from sentence_transformers import SentenceTransformer class EmbeddingModel(ABC): - def get_embedding(self, text: str) -> [float]: pass @@ -14,7 +14,6 @@ def get_embeddings(self, messages: [str]) -> [[float]]: class GPT4Adapter(EmbeddingModel): - def __init__(self, api_key: str): self.api_key = api_key openai.api_key = api_key @@ -28,7 +27,9 @@ def get_embedding(self, text: str, model="text-embedding-ada-002"): return None if isinstance(text, str): text = text.replace("\n", " ") - return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding'] + return openai.Embedding.create(input=[text], model=model)["data"][0][ + "embedding" + ] except Exception as e: logging.error(f"Error getting embedding for {text}: {e}") return None @@ -36,11 +37,32 @@ def get_embedding(self, text: str, model="text-embedding-ada-002"): def get_embeddings(self, messages: [str], model="text-embedding-ada-002"): # store index of nan entries response = 
openai.Embedding.create(input=messages, model=model)
-        return [item['embedding'] for item in response['data']]
+        return [item["embedding"] for item in response["data"]]
 
 
-class TextEmbedding:
+class MPNetAdapter(EmbeddingModel):
+    def __init__(self, model="sentence-transformers/all-mpnet-base-v2"):
+        logging.getLogger().setLevel(logging.INFO)
+        # instantiate the sentence transformer once; constructing it inside
+        # get_embedding would reload the model weights for every description
+        self.mpnet_model = SentenceTransformer(model)
 
+    def get_embedding(self, text: str):
+        logging.info(f"Getting embedding for {text}")
+        try:
+            if text is None or text == "" or text is np.nan:
+                logging.warning("Empty text passed to get_embedding")
+                return None
+            if isinstance(text, str):
+                text = text.replace("\n", " ")
+            return self.mpnet_model.encode(text)
+        except Exception as e:
+            logging.error(f"Error getting embedding for {text}: {e}")
+            return None
+
+    def get_embeddings(self, messages: [str]) -> [[float]]:
+        return [self.get_embedding(msg) for msg in messages]
+
+
+class TextEmbedding:
     def __init__(self, text: str, embedding: [float]):
         self.text = text
         self.embedding = embedding
diff --git a/index/evaluation.py b/index/evaluation.py
index e6a006c..8c9160b 100644
--- a/index/evaluation.py
+++ b/index/evaluation.py
@@ -3,17 +3,24 @@
 from thefuzz import process
 import pandas as pd
 import numpy as np
+from scipy.spatial import distance
+from sklearn.metrics.pairwise import cosine_distances
 
 from index.mapping import MappingTable
 
 
 class MatchingMethod(Enum):
-    EUCLIDEAN_EMBEDDING_DISTANCE = 1,
-    FUZZY_STRING_MATCHING = 2
+    EUCLIDEAN_EMBEDDING_DISTANCE = 1
+    FUZZY_STRING_MATCHING = 2
+    COSINE_EMBEDDING_DISTANCE = 3
 
 
-def enrichment_analysis(source_table: MappingTable, target_table: MappingTable, max_cumulative_match_rank: int = 10,
-                        matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) -> np.ndarray:
+def enrichment_analysis(
+    source_table: MappingTable,
+    target_table: MappingTable,
+    max_cumulative_match_rank: int = 10,
+    matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE,
+) -> np.ndarray:
     """
     Calculate accuracy for the n closest matches between two mapping tables
 
@@ -29,17 +36,26 @@
     # not every variable can be matched
     max_matches = 0
     # clean up source and target table (missing embeddings, descriptions etc.)
- source_table.joined_mapping_table.drop_duplicates(subset=['variable'], keep='first', inplace=True) + source_table.joined_mapping_table.drop_duplicates( + subset=["variable"], keep="first", inplace=True + ) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) - if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: + if ( + matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE + ): source_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) target_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) # re-index to account for dropped rows - target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index(drop=True) + target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index( + drop=True + ) for idx, source_table_row in source_table.joined_mapping_table.iterrows(): correct_target_index = target_table.joined_mapping_table[ - target_table.joined_mapping_table["identifier"] == source_table_row["identifier"]].index + target_table.joined_mapping_table["identifier"] + == source_table_row["identifier"] + ].index if len(correct_target_index) == 0: # can not be matched -> skip continue @@ -51,23 +67,46 @@ def enrichment_analysis(source_table: MappingTable, target_table: MappingTable, if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: source_table_embedding = source_table_row["embedding"] target_table_embedding = target_table_row["embedding"] - distances.append(np.linalg.norm(np.array(source_table_embedding) - np.array(target_table_embedding))) + distances.append( + np.linalg.norm( + np.array(source_table_embedding) + - np.array(target_table_embedding) + ) + ) + elif matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE: + source_table_embedding = np.array(source_table_row["embedding"]) + target_table_embedding = np.array(target_table_row["embedding"]) + distances.append( + distance.cosine(source_table_embedding, target_table_embedding) + ) elif matching_method == MatchingMethod.FUZZY_STRING_MATCHING: source_table_description = source_table_row["description"] target_table_description = target_table_row["description"] - distances.append(100 - fuzz.ratio(source_table_description, target_table_description)) + distances.append( + 100 - fuzz.ratio(source_table_description, target_table_description) + ) else: - raise NotImplementedError("Specified matching method is not implemented!") - min_distance_indices = np.argsort(np.array(distances))[:max_cumulative_match_rank] + raise NotImplementedError( + "Specified matching method is not implemented!" 
+ ) + min_distance_indices = np.argsort(np.array(distances))[ + :max_cumulative_match_rank + ] for n in range(max_cumulative_match_rank): # (due to upper level concepts) there may be more than one correct mapping - if any(element in min_distance_indices[:n+1] for element in correct_target_index): + if any( + element in min_distance_indices[: n + 1] + for element in correct_target_index + ): correct_matches[n] += 1 return (correct_matches / max_matches).round(2) -def match_closest_descriptions(source_table: MappingTable, target_table: MappingTable, - matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) -> pd.DataFrame: +def match_closest_descriptions( + source_table: MappingTable, + target_table: MappingTable, + matching_method=MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE, +) -> pd.DataFrame: """ Match descriptions from source table to target table based on the biggest similarity @@ -79,45 +118,87 @@ def match_closest_descriptions(source_table: MappingTable, target_table: Mapping """ # sometimes the same concept gets mapped against multiple concepts in CDM, resulting in artifacts in the results # -> drop duplicates, only keep first - source_table.joined_mapping_table.drop_duplicates(subset=['variable'], keep='first', inplace=True) + source_table.joined_mapping_table.drop_duplicates( + subset=["variable"], keep="first", inplace=True + ) # remove rows from source and target that do not contain either a description (in general) or embedding (for gpt) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) - if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: + if ( + matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE + ): source_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) target_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) # method -> compute distance based on embeddings - if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: - if "embedding" not in source_table.joined_mapping_table.columns \ - or "embedding" not in target_table.joined_mapping_table.columns: + if ( + matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE + ): + if ( + "embedding" not in source_table.joined_mapping_table.columns + or "embedding" not in target_table.joined_mapping_table.columns + ): raise ValueError("Mapping tables must contain an 'embedding' column") # re-index to account for dropped rows - target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index(drop=True) + target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index( + drop=True + ) # METHOD: Euclidean Distance based on embeddings if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: - if "embedding" not in source_table.joined_mapping_table.columns \ - or "embedding" not in target_table.joined_mapping_table.columns: + if ( + "embedding" not in source_table.joined_mapping_table.columns + or "embedding" not in target_table.joined_mapping_table.columns + ): + raise ValueError("Mapping tables must contain an 'embedding' column") + source_embeddings = source_table.get_embeddings_numpy() + target_embeddings = target_table.get_embeddings_numpy() + distance_matrix = np.linalg.norm( + source_embeddings[:, np.newaxis] - target_embeddings, axis=-1 + ) + closest_indices = 
np.argmin(distance_matrix, axis=1) + distances = np.min(distance_matrix, axis=1) + matched_target_descriptions = target_table.joined_mapping_table.loc[ + closest_indices, "description" + ].tolist() + # METHOD: Cosine Distance based on embeddings + elif matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE: + if ( + "embedding" not in source_table.joined_mapping_table.columns + or "embedding" not in target_table.joined_mapping_table.columns + ): raise ValueError("Mapping tables must contain an 'embedding' column") source_embeddings = source_table.get_embeddings_numpy() target_embeddings = target_table.get_embeddings_numpy() - distance_matrix = np.linalg.norm(source_embeddings[:, np.newaxis] - target_embeddings, axis=-1) + distance_matrix = cosine_distances(source_embeddings, target_embeddings) closest_indices = np.argmin(distance_matrix, axis=1) distances = np.min(distance_matrix, axis=1) - matched_target_descriptions = target_table.joined_mapping_table.loc[closest_indices, 'description'].tolist() + matched_target_descriptions = target_table.joined_mapping_table.loc[ + closest_indices, "description" + ].tolist() # METHOD: Fuzzy String Matching based on Levenstein Distance elif matching_method == MatchingMethod.FUZZY_STRING_MATCHING: - if "description" not in source_table.joined_mapping_table.columns \ - or "description" not in target_table.joined_mapping_table.columns: + if ( + "description" not in source_table.joined_mapping_table.columns + or "description" not in target_table.joined_mapping_table.columns + ): raise ValueError("Mapping tables must contain an 'description' column") - source_descriptions = source_table.joined_mapping_table["description"].to_numpy() - target_descriptions = target_table.joined_mapping_table["description"].to_numpy() - target_descriptions_dict = {idx: el for idx, el in enumerate(target_descriptions)} + source_descriptions = source_table.joined_mapping_table[ + "description" + ].to_numpy() + target_descriptions = target_table.joined_mapping_table[ + "description" + ].to_numpy() + target_descriptions_dict = { + idx: el for idx, el in enumerate(target_descriptions) + } closest_indices = [] distances = [] matched_target_descriptions = [] for source_description in source_descriptions: - matched_target_description, distance, target_idx = process.extractOne(source_description, - target_descriptions_dict) + matched_target_description, distance, target_idx = process.extractOne( + source_description, target_descriptions_dict + ) closest_indices.append(target_idx) matched_target_descriptions.append(matched_target_description) # it is not a distance but a score [0,100] in this case -> take inverse (+1 to avoid division by 0) @@ -126,26 +207,37 @@ def match_closest_descriptions(source_table: MappingTable, target_table: Mapping else: raise ValueError("Specified Matching method is not implemented!") source_concept_label = source_table.joined_mapping_table["identifier"] - target_concept_label = target_table.joined_mapping_table.loc[closest_indices, 'identifier'].tolist() + target_concept_label = target_table.joined_mapping_table.loc[ + closest_indices, "identifier" + ].tolist() source_variable = source_table.joined_mapping_table["variable"] - target_variable = target_table.joined_mapping_table.loc[closest_indices, 'variable'].tolist() + target_variable = target_table.joined_mapping_table.loc[ + closest_indices, "variable" + ].tolist() correct = source_concept_label == target_concept_label - ground_truth_target_descriptions = 
get_ground_truth_target_descriptions(source_table.joined_mapping_table, - target_table.joined_mapping_table) + ground_truth_target_descriptions = get_ground_truth_target_descriptions( + source_table.joined_mapping_table, target_table.joined_mapping_table + ) source_descriptions = source_table.joined_mapping_table["description"] - result = pd.DataFrame({"correct": correct, - "source_variable": source_variable, - "target_variable": target_variable, - "source_concept_label": source_concept_label, - "target_concept_label": target_concept_label, - "source_description": source_descriptions, - "matched_target_description": matched_target_descriptions, - "ground_truth_target_description": ground_truth_target_descriptions, - "distance": distances}) + result = pd.DataFrame( + { + "correct": correct, + "source_variable": source_variable, + "target_variable": target_variable, + "source_concept_label": source_concept_label, + "target_concept_label": target_concept_label, + "source_description": source_descriptions, + "matched_target_description": matched_target_descriptions, + "ground_truth_target_description": ground_truth_target_descriptions, + "distance": distances, + } + ) return result -def get_ground_truth_target_descriptions(source_table: pd.DataFrame, target_table: pd.DataFrame) -> np.ndarray[str]: +def get_ground_truth_target_descriptions( + source_table: pd.DataFrame, target_table: pd.DataFrame +) -> np.ndarray[str]: """ Get the ground truth target descriptions based on the matched identifiers @@ -157,7 +249,9 @@ def get_ground_truth_target_descriptions(source_table: pd.DataFrame, target_tabl descriptions = [] for source_id in source_table["identifier"]: try: - target_description = target_table.loc[target_table["identifier"] == source_id, "description"].iloc[0] + target_description = target_table.loc[ + target_table["identifier"] == source_id, "description" + ].iloc[0] descriptions.append(target_description) except IndexError: descriptions.append(None) @@ -177,3 +271,69 @@ def score_mappings(matches: pd.DataFrame) -> float: matches = matches[matches["target_concept_label"].notnull()] accuracy = matches["correct"].sum() / len(matches) return accuracy + + +def evaluate( + datasets, + labels, + store_results=False, + model="gpt", + results_root_dir="resources/results/pd", +): + + if model == "gpt": + data_gpt = {} + data_fuzzy = {} + for idx, source in enumerate(datasets): + acc_gpt = [] + acc_fuzzy = [] + for idy, target in enumerate(datasets): + map_gpt = match_closest_descriptions(source, target) + map_fuzzy = match_closest_descriptions( + source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING + ) + if target == "jadni": + print("check") + if store_results: + map_gpt.to_excel( + results_root_dir + + "/gpt_" + + f"{labels[idx]}_to_{labels[idy]}.xlsx" + ) + map_fuzzy.to_excel( + results_root_dir + + "/fuzzy_" + + f"{labels[idx]}_to_{labels[idy]}.xlsx" + ) + acc_gpt.append(round(score_mappings(map_gpt), 2)) + acc_fuzzy.append(round(score_mappings(map_fuzzy), 2)) + data_gpt[labels[idx]] = acc_gpt + data_fuzzy[labels[idx]] = acc_fuzzy + # transpose to have from -> to | row -> column like in the paper + gpt = pd.DataFrame(data_gpt, index=labels).T + fuzzy = pd.DataFrame(data_fuzzy, index=labels).T + return gpt, fuzzy + + elif model == "mpnet": + data_mpnet = {} + for idx, source in enumerate(datasets): + acc_mpnet = [] + for idy, target in enumerate(datasets): + map_mpnet = match_closest_descriptions( + source, + target, + matching_method=MatchingMethod.COSINE_EMBEDDING_DISTANCE, + ) 
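+            # cosine distance is used for the MPNet embeddings: the
+            # all-mpnet-base-v2 model returns length-normalised vectors, so
+            # cosine and Euclidean distance should produce the same ranking here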
+ if target == "jadni": + print("check") + if store_results: + map_mpnet.to_excel( + results_root_dir + + "/mpnet_" + + f"{labels[idx]}_to_{labels[idy]}.xlsx" + ) + acc_mpnet.append(round(score_mappings(map_mpnet), 2)) + data_mpnet[labels[idx]] = acc_mpnet + # transpose to have from -> to | row -> column like in the paper + mpnet = pd.DataFrame(data_mpnet, index=labels).T + return mpnet diff --git a/index/main.py b/index/main.py index 9684daf..08dbe96 100644 --- a/index/main.py +++ b/index/main.py @@ -1,189 +1,605 @@ import os +import sys +sys.path.append("../") import pandas as pd from index import evaluation -from index.conf import PD_CDM_SRC, PPMI_DICT_SRC, LUXPARK_DICT_SRC, BIOFIND_DICT_SRC, AD_CDM_SRC -from index.embedding import GPT4Adapter -from index.evaluation import match_closest_descriptions, MatchingMethod, enrichment_analysis +from index.conf import ( + PD_CDM_SRC, + PPMI_DICT_SRC, + LUXPARK_DICT_SRC, + BIOFIND_DICT_SRC, + AD_CDM_SRC, +) +from index.embedding import GPT4Adapter, MPNetAdapter +from index.evaluation import ( + match_closest_descriptions, + MatchingMethod, + enrichment_analysis, + evaluate, +) from index.mapping import MappingTable from index.parsing import MappingSource, DataDictionarySource from dotenv import load_dotenv -from index.visualisation import scatter_plot_two_distributions, enrichment_plot, scatter_plot_all_cohorts +from index.visualisation import ( + scatter_plot_two_distributions, + enrichment_plot, + scatter_plot_all_cohorts, + bar_chart_average_acc_two_distributions, +) EVAL_PD = True EVAL_AD = True load_dotenv() -gpt4 = GPT4Adapter(api_key=os.getenv('GPT_KEY')) - - -def evaluate(datasets, labels, store_results=False, results_root_dir="resources/results/pd"): - data_gpt = {} - data_fuzzy = {} - for idx, source in enumerate(datasets): - acc_gpt = [] - acc_fuzzy = [] - for idy, target in enumerate(datasets): - map_gpt = match_closest_descriptions(source, target) - map_fuzzy = match_closest_descriptions(source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING) - if target == "jadni": - print("check") - if store_results: - map_gpt.to_excel(results_root_dir + "/gpt_" + f"{labels[idx]}_to_{labels[idy]}.xlsx") - map_fuzzy.to_excel(results_root_dir + "/fuzzy_" + f"{labels[idx]}_to_{labels[idy]}.xlsx") - acc_gpt.append(round(evaluation.score_mappings(map_gpt), 2)) - acc_fuzzy.append(round(evaluation.score_mappings(map_fuzzy), 2)) - data_gpt[labels[idx]] = acc_gpt - data_fuzzy[labels[idx]] = acc_fuzzy - # transpose to have from -> to | row -> column like in the paper - gpt = pd.DataFrame(data_gpt, index=labels).T - fuzzy = pd.DataFrame(data_fuzzy, index=labels).T - return gpt, fuzzy - +gpt4 = GPT4Adapter(api_key=os.getenv("GPT_KEY")) # type: ignore +mpnet = MPNetAdapter() # PD Mappings if EVAL_PD: - cdm_pd = MappingTable(MappingSource(PD_CDM_SRC, "Feature", "CURIE")) - cdm_pd.joined_mapping_table["identifier"].to_csv("resources/cdm_curie.csv", index=False) - cdm_pd.add_descriptions(DataDictionarySource(PD_CDM_SRC, "Feature", "Definition")) - cdm_pd.compute_embeddings(gpt4) - - ppmi = MappingTable(MappingSource(PD_CDM_SRC, "PPMI", "CURIE")) - ppmi.add_descriptions(DataDictionarySource(PPMI_DICT_SRC, "ITM_NAME", "DSCR")) - ppmi.compute_embeddings(gpt4) - - luxpark = MappingTable(MappingSource(PD_CDM_SRC, "LuxPARK", "CURIE")) - luxpark.add_descriptions(DataDictionarySource(LUXPARK_DICT_SRC, "Variable / Field Name", "Field Label")) - luxpark.compute_embeddings(gpt4) - - biofind = MappingTable(MappingSource(PD_CDM_SRC, "BIOFIND", "CURIE")) - 
biofind.add_descriptions(DataDictionarySource(BIOFIND_DICT_SRC, "ITM_NAME", "DSCR")) - biofind.compute_embeddings(gpt4) - - lrrk2 = MappingTable(MappingSource(PD_CDM_SRC, "LRRK2", "CURIE")) - lrrk2.add_descriptions(DataDictionarySource("resources/dictionaries/pd/LRRK2.xlsx", "Variable", "Label")) - lrrk2.compute_embeddings(gpt4) - - opdc = MappingTable(MappingSource(PD_CDM_SRC, "OPDC", "CURIE")) - opdc.add_descriptions( - DataDictionarySource("resources/dictionaries/pd/OPDC.csv", "Variable Name", "Variable description")) - opdc.compute_embeddings(gpt4) - - tpd = MappingTable(MappingSource(PD_CDM_SRC, "TPD", "CURIE")) - tpd.add_descriptions( - DataDictionarySource("resources/dictionaries/pd/TPD.csv", "Variable Name", "Variable description")) - tpd.compute_embeddings(gpt4) - - pd_datesets = [opdc, tpd, biofind, lrrk2, luxpark, ppmi, cdm_pd] - pd_datasets_labels = ["OPDC", "TPD", "Biofind", "LRRK2", "LuxPARK", "PPMI", "PASSIONATE"] + cdm_pd_gpt = MappingTable(MappingSource(PD_CDM_SRC, "Feature", "CURIE")) + cdm_pd_gpt.joined_mapping_table["identifier"].to_csv( + "resources/cdm_curie.csv", index=False + ) + cdm_pd_gpt.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_pd_gpt.compute_embeddings(gpt4) + + cdm_pd_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "Feature", "CURIE")) + cdm_pd_mpnet.joined_mapping_table["identifier"].to_csv( + "resources/cdm_curie.csv", index=False + ) + cdm_pd_mpnet.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_pd_mpnet.compute_embeddings(mpnet) + + ppmi_gpt = MappingTable(MappingSource(PD_CDM_SRC, "PPMI", "CURIE")) + ppmi_gpt.add_descriptions(DataDictionarySource(PPMI_DICT_SRC, "ITM_NAME", "DSCR")) + ppmi_gpt.compute_embeddings(gpt4) + + ppmi_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "PPMI", "CURIE")) + ppmi_mpnet.add_descriptions(DataDictionarySource(PPMI_DICT_SRC, "ITM_NAME", "DSCR")) + ppmi_mpnet.compute_embeddings(mpnet) + + luxpark_gpt = MappingTable(MappingSource(PD_CDM_SRC, "LuxPARK", "CURIE")) + luxpark_gpt.add_descriptions( + DataDictionarySource(LUXPARK_DICT_SRC, "Variable / Field Name", "Field Label") + ) + luxpark_gpt.compute_embeddings(gpt4) + + luxpark_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "LuxPARK", "CURIE")) + luxpark_mpnet.add_descriptions( + DataDictionarySource(LUXPARK_DICT_SRC, "Variable / Field Name", "Field Label") + ) + luxpark_mpnet.compute_embeddings(mpnet) + + biofind_gpt = MappingTable(MappingSource(PD_CDM_SRC, "BIOFIND", "CURIE")) + biofind_gpt.add_descriptions( + DataDictionarySource(BIOFIND_DICT_SRC, "ITM_NAME", "DSCR") + ) + biofind_gpt.compute_embeddings(gpt4) + + biofind_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "BIOFIND", "CURIE")) + biofind_mpnet.add_descriptions( + DataDictionarySource(BIOFIND_DICT_SRC, "ITM_NAME", "DSCR") + ) + biofind_mpnet.compute_embeddings(mpnet) + + lrrk2_gpt = MappingTable(MappingSource(PD_CDM_SRC, "LRRK2", "CURIE")) + lrrk2_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/LRRK2.xlsx", "Variable", "Label" + ) + ) + lrrk2_gpt.compute_embeddings(gpt4) + + lrrk2_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "LRRK2", "CURIE")) + lrrk2_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/LRRK2.xlsx", "Variable", "Label" + ) + ) + lrrk2_mpnet.compute_embeddings(mpnet) + + opdc_gpt = MappingTable(MappingSource(PD_CDM_SRC, "OPDC", "CURIE")) + opdc_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/OPDC.csv", + "Variable Name", + "Variable 
description", + ) + ) + opdc_gpt.compute_embeddings(gpt4) + + opdc_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "OPDC", "CURIE")) + opdc_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/OPDC.csv", + "Variable Name", + "Variable description", + ) + ) + opdc_mpnet.compute_embeddings(mpnet) + + tpd_gpt = MappingTable(MappingSource(PD_CDM_SRC, "TPD", "CURIE")) + tpd_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/TPD.csv", + "Variable Name", + "Variable description", + ) + ) + tpd_gpt.compute_embeddings(gpt4) + + tpd_mpnet = MappingTable(MappingSource(PD_CDM_SRC, "TPD", "CURIE")) + tpd_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/pd/TPD.csv", + "Variable Name", + "Variable description", + ) + ) + tpd_mpnet.compute_embeddings(mpnet) + + pd_datasets_gpt = [ + opdc_gpt, + tpd_gpt, + biofind_gpt, + lrrk2_gpt, + luxpark_gpt, + ppmi_gpt, + cdm_pd_gpt, + ] + pd_datasets_mpnet = [ + opdc_mpnet, + tpd_mpnet, + biofind_mpnet, + lrrk2_mpnet, + luxpark_mpnet, + ppmi_mpnet, + cdm_pd_mpnet, + ] + pd_datasets_labels = [ + "OPDC", + "PRoBaND", + "BIOFIND", + "LCC", + "LuxPARK", + "PPMI", + "PASSIONATE", + ] # enrichment analysis - luxpark_passionate_enrichment_gpt = enrichment_analysis(luxpark, cdm_pd, 20, - MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) - luxpark_passionate_enrichment_fuzzy = enrichment_analysis(luxpark, cdm_pd, 20, MatchingMethod.FUZZY_STRING_MATCHING) + luxpark_passionate_enrichment_gpt = enrichment_analysis( + luxpark_gpt, cdm_pd_gpt, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + ) + luxpark_passionate_enrichment_mpnet = enrichment_analysis( + luxpark_mpnet, cdm_pd_mpnet, 20, MatchingMethod.COSINE_EMBEDDING_DISTANCE + ) + luxpark_passionate_enrichment_fuzzy = enrichment_analysis( + luxpark_gpt, cdm_pd_gpt, 20, MatchingMethod.FUZZY_STRING_MATCHING + ) label1 = "Enrichment Plot LuxPARK to CDM" - ppmi_passionate_enrichment_gpt = enrichment_analysis(ppmi, cdm_pd, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE) - ppmi_passionate_enrichment_fuzzy = enrichment_analysis(ppmi, cdm_pd, 20, MatchingMethod.FUZZY_STRING_MATCHING) + ppmi_passionate_enrichment_gpt = enrichment_analysis( + ppmi_gpt, cdm_pd_gpt, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + ) + ppmi_passionate_enrichment_mpnet = enrichment_analysis( + ppmi_mpnet, cdm_pd_mpnet, 20, MatchingMethod.COSINE_EMBEDDING_DISTANCE + ) + ppmi_passionate_enrichment_fuzzy = enrichment_analysis( + ppmi_gpt, cdm_pd_gpt, 20, MatchingMethod.FUZZY_STRING_MATCHING + ) label2 = "Enrichment Plot PPMI to CDM" - enrichment_plot(luxpark_passionate_enrichment_gpt, luxpark_passionate_enrichment_fuzzy, label1, save_plot=True) - enrichment_plot(ppmi_passionate_enrichment_gpt, ppmi_passionate_enrichment_fuzzy, label2, save_plot=True) + ppmi_luxpark_enrichment_gpt = enrichment_analysis( + ppmi_gpt, luxpark_gpt, 20, MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE + ) + ppmi_luxpark_enrichment_mpnet = enrichment_analysis( + ppmi_mpnet, luxpark_mpnet, 20, MatchingMethod.COSINE_EMBEDDING_DISTANCE + ) + ppmi_luxpark_enrichment_fuzzy = enrichment_analysis( + ppmi_gpt, luxpark_gpt, 20, MatchingMethod.FUZZY_STRING_MATCHING + ) + label3 = "Enrichment Plot PPMI to LuxPARK" + enrichment_plot( + luxpark_passionate_enrichment_gpt, + luxpark_passionate_enrichment_mpnet, + luxpark_passionate_enrichment_fuzzy, + label1, + save_plot=True, + ) + enrichment_plot( + ppmi_passionate_enrichment_gpt, + ppmi_passionate_enrichment_mpnet, + ppmi_passionate_enrichment_fuzzy, + label2, + save_plot=True, + ) 
+ enrichment_plot( + ppmi_luxpark_enrichment_gpt, + ppmi_luxpark_enrichment_mpnet, + ppmi_luxpark_enrichment_fuzzy, + label3, + save_plot=True, + ) print(luxpark_passionate_enrichment_gpt) + print(luxpark_passionate_enrichment_mpnet) print(luxpark_passionate_enrichment_fuzzy) print(ppmi_passionate_enrichment_gpt) + print(ppmi_passionate_enrichment_mpnet) print(ppmi_passionate_enrichment_fuzzy) + print(ppmi_luxpark_enrichment_gpt) + print(ppmi_luxpark_enrichment_mpnet) + print(ppmi_luxpark_enrichment_fuzzy) + + gpt_table1, fuzzy_table1 = evaluate( + pd_datasets_gpt, + pd_datasets_labels, + store_results=True, + model="gpt", + results_root_dir="./resources/results/pd", + ) + + mpnet_table1 = evaluate( + pd_datasets_mpnet, + pd_datasets_labels, + store_results=True, + model="mpnet", + results_root_dir="./resources/results/pd", + ) - gpt_table, fuzzy_table = evaluate(pd_datesets, pd_datasets_labels) print("PD RESULTS:") - print(gpt_table) + print("GPT") + print("-----------") + print(gpt_table1) print("-----------") - print(fuzzy_table) + print("MPNet") + print("-----------") + print(mpnet_table1) + print("-----------") + print("Fuzzy") + print("-----------") + print(fuzzy_table1) print("-----------") # AD Mappings if EVAL_AD: - cdm_ad = cdm_pd = MappingTable(MappingSource(AD_CDM_SRC, "Feature", "CURIE")) - cdm_ad.add_descriptions(DataDictionarySource(PD_CDM_SRC, "Feature", "Definition")) - cdm_ad.compute_embeddings(gpt4) - - a4 = MappingTable(MappingSource(AD_CDM_SRC, "A4", "CURIE")) - a4.add_descriptions(DataDictionarySource("resources/dictionaries/ad/a4.csv", "FLDNAME", "TEXT")) - a4.compute_embeddings(gpt4) - - abvib = MappingTable(MappingSource(AD_CDM_SRC, "ABVIB", "CURIE")) - abvib.add_descriptions(DataDictionarySource("resources/dictionaries/ad/abvib.csv", "variable_name", "description")) - abvib.compute_embeddings(gpt4) - - adni = MappingTable(MappingSource(AD_CDM_SRC, "ADNI", "CURIE")) - adni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/adni.csv", "FLDNAME", "TEXT")) - adni.compute_embeddings(gpt4) - - aibl = MappingTable(MappingSource(AD_CDM_SRC, "AIBL", "CURIE")) - aibl.add_descriptions(DataDictionarySource("resources/dictionaries/ad/aibl.csv", "Name", "Description")) - aibl.compute_embeddings(gpt4) - - arwibo = MappingTable(MappingSource(AD_CDM_SRC, "ARWIBO", "CURIE")) - arwibo.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/arwibo.csv", "Variable_Name", "Element_description")) - arwibo.compute_embeddings(gpt4) - - dod_adni = MappingTable(MappingSource(AD_CDM_SRC, "DOD-ADNI", "CURIE")) + cdm_ad_gpt = cdm_pd_gpt = MappingTable( + MappingSource(AD_CDM_SRC, "Feature", "CURIE") + ) + cdm_ad_gpt.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_ad_gpt.compute_embeddings(gpt4) + + cdm_ad_mpnet = cdm_pd_gpt = MappingTable( + MappingSource(AD_CDM_SRC, "Feature", "CURIE") + ) + cdm_ad_mpnet.add_descriptions( + DataDictionarySource(PD_CDM_SRC, "Feature", "Definition") + ) + cdm_ad_mpnet.compute_embeddings(mpnet) + + a4_gpt = MappingTable(MappingSource(AD_CDM_SRC, "A4", "CURIE")) + a4_gpt.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/a4.csv", "FLDNAME", "TEXT") + ) + a4_gpt.compute_embeddings(gpt4) + + a4_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "A4", "CURIE")) + a4_mpnet.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/a4.csv", "FLDNAME", "TEXT") + ) + a4_mpnet.compute_embeddings(mpnet) + + abvib_gpt = MappingTable(MappingSource(AD_CDM_SRC, "ABVIB", "CURIE")) + 
abvib_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/abvib.csv", + "variable_name", + "description", + ) + ) + abvib_gpt.compute_embeddings(gpt4) + + abvib_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "ABVIB", "CURIE")) + abvib_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/abvib.csv", + "variable_name", + "description", + ) + ) + abvib_mpnet.compute_embeddings(mpnet) + + adni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "ADNI", "CURIE")) + adni_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/ADNIMERGE_DICT_27Nov2023 2.csv", + "FLDNAME", + "TEXT", + ) + ) + adni_gpt.compute_embeddings(gpt4) + + adni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "ADNI", "CURIE")) + adni_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/ADNIMERGE_DICT_27Nov2023 2.csv", + "FLDNAME", + "TEXT", + ) + ) + adni_mpnet.compute_embeddings(mpnet) + + aibl_gpt = MappingTable(MappingSource(AD_CDM_SRC, "AIBL", "CURIE")) + aibl_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/aibl.csv", "Name", "Description" + ) + ) + aibl_gpt.compute_embeddings(gpt4) + + aibl_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "AIBL", "CURIE")) + aibl_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/aibl.csv", "Name", "Description" + ) + ) + aibl_mpnet.compute_embeddings(mpnet) + + arwibo_gpt = MappingTable(MappingSource(AD_CDM_SRC, "ARWIBO", "CURIE")) + arwibo_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/arwibo.csv", + "Variable_Name", + "Element_description", + ) + ) + arwibo_gpt.compute_embeddings(gpt4) + + arwibo_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "ARWIBO", "CURIE")) + arwibo_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/arwibo.csv", + "Variable_Name", + "Element_description", + ) + ) + arwibo_mpnet.compute_embeddings(mpnet) + + dod_adni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "DOD-ADNI", "CURIE")) # TODO most descriptions missing - dod_adni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/dod-adni.csv", "FLDNAME", "TEXT")) - dod_adni.compute_embeddings(gpt4) - - edsd = MappingTable(MappingSource(AD_CDM_SRC, "EDSD", "CURIE")) - edsd.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/edsd.xlsx", "Variable_Name", "Element_description")) - edsd.compute_embeddings(gpt4) - - emif = MappingTable(MappingSource(AD_CDM_SRC, "EMIF", "CURIE")) - emif.add_descriptions(DataDictionarySource("resources/dictionaries/ad/emif.xlsx", "Variable", "Description")) - emif.compute_embeddings(gpt4) - - i_adni = MappingTable(MappingSource(AD_CDM_SRC, "I-ADNI", "CURIE")) + dod_adni_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/dod-adni.csv", "FLDNAME", "TEXT" + ) + ) + dod_adni_gpt.compute_embeddings(gpt4) + + dod_adni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "DOD-ADNI", "CURIE")) + # TODO most descriptions missing + dod_adni_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/dod-adni.csv", "FLDNAME", "TEXT" + ) + ) + dod_adni_mpnet.compute_embeddings(mpnet) + + edsd_gpt = MappingTable(MappingSource(AD_CDM_SRC, "EDSD", "CURIE")) + edsd_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/edsd.xlsx", + "Variable_Name", + "Element_description", + ) + ) + edsd_gpt.compute_embeddings(gpt4) + + edsd_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "EDSD", "CURIE")) + edsd_mpnet.add_descriptions( + 
DataDictionarySource( + "resources/dictionaries/ad/edsd.xlsx", + "Variable_Name", + "Element_description", + ) + ) + edsd_mpnet.compute_embeddings(mpnet) + + emif_gpt = MappingTable(MappingSource(AD_CDM_SRC, "EMIF", "CURIE")) + emif_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/emif.xlsx", "Variable", "Description" + ) + ) + emif_gpt.compute_embeddings(gpt4) + + emif_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "EMIF", "CURIE")) + emif_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/emif.xlsx", "Variable", "Description" + ) + ) + emif_mpnet.compute_embeddings(mpnet) + + i_adni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "I-ADNI", "CURIE")) # TODO about half of descriptions missing - i_adni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/i-adni.csv", "acronym", "variable")) - i_adni.compute_embeddings(gpt4) - - jadni = MappingTable(MappingSource(AD_CDM_SRC, "JADNI", "CURIE")) - jadni.add_descriptions(DataDictionarySource("resources/dictionaries/ad/jadni.tsv", "FLDNAME", "TEXT")) - jadni.compute_embeddings(gpt4) - - pharmacog = MappingTable(MappingSource(AD_CDM_SRC, "PharmaCog", "CURIE")) - pharmacog.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/pharmacog.csv", "Variable_Name", "Element_description")) - pharmacog.compute_embeddings(gpt4) - - prevent_ad = MappingTable(MappingSource(AD_CDM_SRC, "PREVENT-AD", "CURIE")) - prevent_ad.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/prevent-ad.csv", "variable", "description")) - prevent_ad.compute_embeddings(gpt4) - - vita = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) - vita.add_descriptions( - DataDictionarySource("resources/dictionaries/ad/vita.csv", "Variable_Name", "Element_description")) - vita.compute_embeddings(gpt4) + i_adni_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/i-adni.csv", "acronym", "variable" + ) + ) + i_adni_gpt.compute_embeddings(gpt4) + + i_adni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "I-ADNI", "CURIE")) + # TODO about half of descriptions missing + i_adni_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/i-adni.csv", "acronym", "variable" + ) + ) + i_adni_mpnet.compute_embeddings(mpnet) + + jadni_gpt = MappingTable(MappingSource(AD_CDM_SRC, "JADNI", "CURIE")) + jadni_gpt.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/jadni.tsv", "FLDNAME", "TEXT") + ) + jadni_gpt.compute_embeddings(gpt4) + + jadni_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "JADNI", "CURIE")) + jadni_mpnet.add_descriptions( + DataDictionarySource("resources/dictionaries/ad/jadni.tsv", "FLDNAME", "TEXT") + ) + jadni_mpnet.compute_embeddings(mpnet) + + pharmacog_gpt = MappingTable(MappingSource(AD_CDM_SRC, "PharmaCog", "CURIE")) + pharmacog_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/pharmacog.csv", + "Variable_Name", + "Element_description", + ) + ) + pharmacog_gpt.compute_embeddings(gpt4) + + pharmacog_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "PharmaCog", "CURIE")) + pharmacog_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/pharmacog.csv", + "Variable_Name", + "Element_description", + ) + ) + pharmacog_mpnet.compute_embeddings(mpnet) + + prevent_ad_gpt = MappingTable(MappingSource(AD_CDM_SRC, "PREVENT-AD", "CURIE")) + prevent_ad_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/prevent-ad.csv", + "variable", + "description", + ) + ) + 
prevent_ad_gpt.compute_embeddings(gpt4) + + prevent_ad_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "PREVENT-AD", "CURIE")) + prevent_ad_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/prevent-ad.csv", + "variable", + "description", + ) + ) + prevent_ad_mpnet.compute_embeddings(mpnet) + + vita_gpt = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) + vita_gpt.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/vita.csv", + "Variable_Name", + "Element_description", + ) + ) + vita_gpt.compute_embeddings(gpt4) + + vita_mpnet = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) + vita_mpnet.add_descriptions( + DataDictionarySource( + "resources/dictionaries/ad/vita.csv", + "Variable_Name", + "Element_description", + ) + ) + vita_mpnet.compute_embeddings(mpnet) wmh_ad = MappingTable(MappingSource(AD_CDM_SRC, "VITA", "CURIE")) - ad_datasets = [a4, abvib, adni, aibl, arwibo, dod_adni, edsd, emif, i_adni, jadni, pharmacog, prevent_ad, vita, - cdm_ad] - ad_datasets_labels = ["A4", "Abvib", "ADNI", "AIBL", "ARWIBO", "DOD-ADNI", "EDSD", "EMIF", "I-ADNI", "JADNI", - "PharmaCog", "PREVENT-AD", "VITA", "AD-Mapper"] - gpt_table, fuzzy_table = evaluate(ad_datasets, ad_datasets_labels) + ad_datasets_gpt = [ + a4_gpt, + abvib_gpt, + adni_gpt, + aibl_gpt, + arwibo_gpt, + dod_adni_gpt, + edsd_gpt, + emif_gpt, + i_adni_gpt, + jadni_gpt, + pharmacog_gpt, + prevent_ad_gpt, + vita_gpt, + cdm_ad_gpt, + ] + + ad_datasets_mpnet = [ + a4_mpnet, + abvib_mpnet, + adni_mpnet, + aibl_mpnet, + arwibo_mpnet, + dod_adni_mpnet, + edsd_mpnet, + emif_mpnet, + i_adni_mpnet, + jadni_mpnet, + pharmacog_mpnet, + prevent_ad_mpnet, + vita_mpnet, + cdm_ad_mpnet, + ] + ad_datasets_labels = [ + "A4", + "ABVIB", + "ADNI", + "AIBL", + "ARWIBO", + "DOD-ADNI", + "EDSD", + "EMIF", + "I-ADNI", + "JADNI", + "PharmaCog", + "PREVENT-AD", + "VITA", + "AD-Mapper", + ] + gpt_table2, fuzzy_table2 = evaluate( + ad_datasets_gpt, + ad_datasets_labels, + store_results=True, + model="gpt", + results_root_dir="resources/results/ad", + ) + + mpnet_table2 = evaluate( + ad_datasets_mpnet, + ad_datasets_labels, + store_results=True, + model="mpnet", + results_root_dir="resources/results/ad", + ) print("AD RESULTS:") - print(gpt_table.to_string()) + print("GPT") + print("-----------") + print(gpt_table2.to_string()) + print("-----------") + print("MPNet") print("-----------") - print(fuzzy_table.to_string()) + print(mpnet_table2.to_string()) + print("-----------") + print("Fuzzy") + print("-----------") + print(fuzzy_table2.to_string()) print("-----------") # embedding distribution -scatter_plot_two_distributions(pd_datesets, ad_datasets, "PD", "AD") -scatter_plot_all_cohorts(pd_datesets, ad_datasets, pd_datasets_labels, ad_datasets_labels) - - +scatter_plot_two_distributions(pd_datasets_gpt, ad_datasets_gpt, "PD", "AD") +scatter_plot_all_cohorts( + pd_datasets_gpt, ad_datasets_gpt, pd_datasets_labels, ad_datasets_labels +) diff --git a/index/mapping.py b/index/mapping.py index da3ba62..5f35013 100644 --- a/index/mapping.py +++ b/index/mapping.py @@ -2,16 +2,19 @@ import numpy as np from index.embedding import EmbeddingModel -from index.db.model import Terminology, Mapping, Concept, Variable +from index.model import Terminology, Mapping, Concept, Variable from index.parsing import MappingSource, DataDictionarySource, EmbeddingSource class MappingTable: - def __init__(self, mapping_source: MappingSource, - data_dictionary_source: DataDictionarySource = None, - embedding_source: EmbeddingSource = 
None, - terminology: Terminology = None): + def __init__( + self, + mapping_source: MappingSource, + data_dictionary_source: DataDictionarySource = None, + embedding_source: EmbeddingSource = None, + terminology: Terminology = None, + ): self.mapping_source: MappingSource = mapping_source self.data_dictionary_source: DataDictionarySource = data_dictionary_source self.embedding_source: EmbeddingSource = embedding_source @@ -35,17 +38,23 @@ def add_descriptions(self, data_dictionary_source: DataDictionarySource): self.data_dictionary_source = data_dictionary_source data_dictionary_df = data_dictionary_source.to_dataframe() # FIXME: Join results in duplicate entries - self.joined_mapping_table = pd.merge(self.joined_mapping_table, data_dictionary_df, - left_on="variable", - right_on="variable", - how="left").drop_duplicates() + self.joined_mapping_table = pd.merge( + self.joined_mapping_table, + data_dictionary_df, + left_on="variable", + right_on="variable", + how="left", + ).drop_duplicates() def add_embeddings(self, embedding_source: EmbeddingSource): self.embedding_source = embedding_source # FIXME: Join results in duplicate entries - self.joined_mapping_table = pd.merge(self.joined_mapping_table, embedding_source.to_dataframe(), - left_on='description', - right_on="description") + self.joined_mapping_table = pd.merge( + self.joined_mapping_table, + embedding_source.to_dataframe(), + left_on="description", + right_on="description", + ) def get_embeddings(self): if "embedding" not in self.joined_mapping_table.columns: @@ -53,32 +62,46 @@ def get_embeddings(self): if "description" not in self.joined_mapping_table.columns: raise ValueError("No descriptions found in mapping table.") else: - return self.joined_mapping_table['embedding'].apply(np.array) + return self.joined_mapping_table["embedding"].apply(np.array) def get_embeddings_numpy(self): - return np.array(self.joined_mapping_table['embedding'].dropna().tolist()) + return np.array(self.joined_mapping_table["embedding"].dropna().tolist()) def save_embeddings(self, output_path: str): self.get_embeddings().to_csv(output_path, index=False) self.embedding_source = EmbeddingSource(output_path) def compute_embeddings(self, model: EmbeddingModel): - descriptions = self.joined_mapping_table['description'].dropna().unique().tolist() + descriptions = ( + self.joined_mapping_table["description"].dropna().unique().tolist() + ) embeddings = model.get_embeddings(descriptions) - embedding_df = pd.DataFrame({'description': descriptions, 'embedding': embeddings}) - self.joined_mapping_table = pd.merge(self.joined_mapping_table, embedding_df, - left_on='description', - right_on='description', - how='left') + embedding_df = pd.DataFrame( + {"description": descriptions, "embedding": embeddings} + ) + self.joined_mapping_table = pd.merge( + self.joined_mapping_table, + embedding_df, + left_on="description", + right_on="description", + how="left", + ) def export_embeddings(self, output_path: str): - descriptions = self.joined_mapping_table['description'].dropna().unique().tolist() - embedding_df = pd.DataFrame({'description': descriptions, 'embedding': self.joined_mapping_table['embedding']}) + descriptions = ( + self.joined_mapping_table["description"].dropna().unique().tolist() + ) + embedding_df = pd.DataFrame( + { + "description": descriptions, + "embedding": self.joined_mapping_table["embedding"], + } + ) embedding_df.to_csv(output_path) def import_embeddings(self, input_path: str): embeddings = pd.read_csv(input_path) - 
self.joined_mapping_table['embedding'] = embeddings['embedding'] + self.joined_mapping_table["embedding"] = embeddings["embedding"] def get_mapping_table(self) -> pd.DataFrame: return self.joined_mapping_table @@ -89,14 +112,20 @@ def get_mappings(self) -> [Mapping]: concept_id = row["identifier"] variable_name = row["variable"] if self.data_dictionary_source is not None: - description = row['description'] + description = row["description"] else: description = None if not pd.isna(concept_id) and not pd.isna(variable_name): concept = Concept(concept_id, self.terminology) - variable = Variable(variable_name, description, - self.data_dictionary_source.file_path - if self.data_dictionary_source is not None else None) + variable = Variable( + variable_name, + description, + ( + self.data_dictionary_source.file_path + if self.data_dictionary_source is not None + else None + ), + ) mapping = Mapping(concept, variable, self.mapping_source.file_path) mappings.append(mapping) # remove duplicates @@ -108,14 +137,20 @@ def to_mapping_dto(self) -> [Mapping]: concept_id = row["identifier"] variable_name = row["variable"] if self.data_dictionary_source is not None: - description = row['description'] + description = row["description"] else: description = None if not pd.isna(concept_id) and not pd.isna(variable_name): concept = Concept(concept_id, self.terminology) - variable = Variable(variable_name, description, - self.data_dictionary_source.file_path - if self.data_dictionary_source is not None else None) + variable = Variable( + variable_name, + description, + ( + self.data_dictionary_source.file_path + if self.data_dictionary_source is not None + else None + ), + ) mapping = Mapping(concept, variable, self.mapping_source.file_path) mappings.append(mapping) # remove duplicates @@ -123,4 +158,4 @@ def to_mapping_dto(self) -> [Mapping]: def parse_float_array(s): - return [float(x) for x in s.strip('[]').split(',')] \ No newline at end of file + return [float(x) for x in s.strip("[]").split(",")] diff --git a/index/db/model.py b/index/model.py similarity index 81% rename from index/db/model.py rename to index/model.py index ab1608c..c30eca5 100644 --- a/index/db/model.py +++ b/index/model.py @@ -27,7 +27,9 @@ def to_dataframe(self): class Variable: - def __init__(self, name: str, description: str, source: str, embedding: Embedding = None): + def __init__( + self, name: str, description: str, source: str, embedding: Embedding = None + ): self.name = name self.description = description self.source = source @@ -42,7 +44,10 @@ def __init__(self, concept: Concept, variable: Variable, source: str): self.source = source def __eq__(self, other): - return self.concept.identifier == other.concept.identifier and self.variable.name == other.variable.name + return ( + self.concept.identifier == other.concept.identifier + and self.variable.name == other.variable.name + ) def __hash__(self): return hash((self.concept.identifier, self.variable.name)) diff --git a/index/parsing.py b/index/parsing.py index b63fbd7..1a25c39 100644 --- a/index/parsing.py +++ b/index/parsing.py @@ -5,35 +5,41 @@ class Source(ABC): - def __int__(self, file_path: str): self.file_path = file_path def to_dataframe(self) -> pd.DataFrame: # TODO: hardcoded for ad resources -> remove later if self.file_path.endswith("pharmacog.csv"): - return pd.read_csv(self.file_path, sep=' ') + return pd.read_csv(self.file_path, sep=" ") elif self.file_path.endswith("arwibo.csv"): - return pd.read_csv(self.file_path, sep=';', usecols=range(6), 
encoding='ISO-8859-1')
+            return pd.read_csv(
+                self.file_path, sep=";", usecols=range(6), encoding="ISO-8859-1"
+            )
         elif self.file_path.endswith("jadni.tsv"):
-            return pd.read_csv(self.file_path, sep='\t', encoding='ISO-8859-1')
+            return pd.read_csv(self.file_path, sep="\t", encoding="ISO-8859-1")
         elif self.file_path.endswith("vita.csv"):
-            return pd.read_csv(self.file_path, sep=',', encoding_errors='ignore')
+            return pd.read_csv(self.file_path, sep=",", encoding_errors="ignore")
         elif self.file_path.endswith("wmh-ad.csv"):
-            pd.read_csv(self.file_path, sep=',', encoding_errors='ignore')
+            return pd.read_csv(self.file_path, sep=",", encoding_errors="ignore")
         elif self.file_path.endswith(".csv"):
             return pd.read_csv(self.file_path)
         # back to general encodings
         elif self.file_path.endswith(".tsv"):
-            return pd.read_csv(self.file_path, sep='\t')
+            return pd.read_csv(self.file_path, sep="\t")
         elif self.file_path.endswith(".xlsx"):
             xls = pd.ExcelFile(self.file_path)
-            dfs = [pd.read_excel(xls, sheet_name=sheet_name) for sheet_name in xls.sheet_names]
+            dfs = [
+                pd.read_excel(xls, sheet_name=sheet_name)
+                for sheet_name in xls.sheet_names
+            ]
             for df in dfs:
                 # Replace control sequences in string columns / headers & remove trailing whitespaces
-                df.columns = df.columns.str.replace('\r', '', regex=True).str.strip()
-                string_columns = df.select_dtypes(include=['object']).columns
-                df[string_columns] = df[string_columns].apply(lambda x: x.str.replace('\r', '').str.strip(), axis=1)
+                df.columns = df.columns.str.replace("\r", "", regex=True).str.strip()
+                string_columns = df.select_dtypes(include=["object"]).columns
+                df[string_columns] = df[string_columns].apply(
+                    lambda x: x.str.replace("\r", "").str.strip(), axis=1
+                )
             combined_df = pd.concat(dfs, ignore_index=True)
             return combined_df
         else:
@@ -54,11 +60,20 @@
     def to_dataframe(self) -> pd.DataFrame:
         df = super().to_dataframe()
         # sanity check
         if self.variable_field not in df.columns:
-            raise ValueError(f"Variable field {self.variable_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Variable field {self.variable_field} not found in {self.file_path}"
+            )
         if self.identifier_field not in df.columns:
-            raise ValueError(f"Identifier field {self.identifier_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Identifier field {self.identifier_field} not found in {self.file_path}"
+            )
         df = df[[self.variable_field, self.identifier_field]]
-        df = df.rename(columns={self.variable_field: "variable", self.identifier_field: "identifier"})
+        df = df.rename(
+            columns={
+                self.variable_field: "variable",
+                self.identifier_field: "identifier",
+            }
+        )
         df.dropna(subset=["variable", "identifier"], inplace=True)
         return df
 
@@ -77,17 +92,25 @@
     def to_dataframe(self) -> pd.DataFrame:
         df = super().to_dataframe()
         # sanity check
         if self.variable_field not in df.columns:
-            raise ValueError(f"Variable field {self.variable_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Variable field {self.variable_field} not found in {self.file_path}"
+            )
         if self.description_field not in df.columns:
-            raise ValueError(f"Description field {self.description_field} not found in {self.file_path}")
+            raise ValueError(
+                f"Description field {self.description_field} not found in {self.file_path}"
+            )
         df = df[[self.variable_field, self.description_field]]
-        df = df.rename(columns={self.variable_field: "variable", self.description_field: "description"})
+        df = df.rename(
+            columns={
+                self.variable_field: "variable",
+                self.description_field: "description",
+            }
+        )
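+        # variables without a description cannot be embedded or fuzzy-matched
+        # downstream, so such rows are dropped together with unnamed variables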
df.dropna(subset=["variable", "description"], inplace=True) return df class EmbeddingSource: - def __init__(self, source_path: str): self.source_path = source_path self.description_field = "description" @@ -106,11 +129,12 @@ def export(self, dst_path: str): def parse_float_array(s): - return [float(x) for x in s.strip('[]').split(',')] + return [float(x) for x in s.strip("[]").split(",")] class ConceptSource: """ identifier -> description """ + pass diff --git a/index/visualisation.py b/index/visualisation.py index e641c15..f88d9fe 100644 --- a/index/visualisation.py +++ b/index/visualisation.py @@ -7,14 +7,15 @@ import matplotlib.pyplot as plt from sklearn.manifold import TSNE import plotly.graph_objects as go +import plotly.express as px from index.conf import COLORS_AD, COLORS_PD from index.mapping import MappingTable class PlotSide(Enum): - LEFT = 1, - RIGHT = 2, + LEFT = (1,) + RIGHT = (2,) BOTH = 3 @@ -30,25 +31,45 @@ def get_cohort_specific_color_code(cohort_name: str): elif cohort_name.lower() in COLORS_PD: return COLORS_PD[cohort_name.lower()] else: - print(f'No color code found for cohort {cohort_name}') + print(f"No color code found for cohort {cohort_name}") return None -def enrichment_plot(acc_gpt, acc_fuzzy, title, save_plot=False, save_dir="resources/results/plots"): - if len(acc_gpt) != len(acc_fuzzy): - raise ValueError("acc_gpt and acc_fuzzy should be of the same length!") - data = {"Maximum Considered Rank": list(range(1, len(acc_gpt) + 1)), "GPT": acc_gpt, - "Fuzzy": acc_fuzzy} +def enrichment_plot( + acc_gpt, + acc_mpnet, + acc_fuzzy, + title, + save_plot=False, + save_dir="resources/results/plots", +): + if ( + len(acc_gpt) != len(acc_fuzzy) + or len(acc_gpt) != len(acc_mpnet) + or len(acc_mpnet) != len(acc_fuzzy) + ): + raise ValueError( + "acc_gpt, acc_mpnet and acc_fuzzy should be of the same length!" 
+ ) + data = { + "Maximum Considered Rank": list(range(1, len(acc_gpt) + 1)), + "GPT": acc_gpt, + "MPNet": acc_mpnet, + "Fuzzy": acc_fuzzy, + } df = pd.DataFrame(data) sns.set(style="whitegrid") sns.lineplot(data=df, x="Maximum Considered Rank", y="GPT", label="GPT") - sns.lineplot(data=df, x="Maximum Considered Rank", y="Fuzzy", label="Fuzzy String Matching") + sns.lineplot(data=df, x="Maximum Considered Rank", y="MPNet", label="MPNet") + sns.lineplot( + data=df, x="Maximum Considered Rank", y="Fuzzy", label="Fuzzy String Matching" + ) sns.set(style="whitegrid") plt.xlabel("Maximum Considered Rank") plt.ylabel("Accuracy") plt.xticks(range(1, len(acc_gpt) + 1), labels=range(1, len(acc_gpt) + 1)) plt.yticks([i / 10 for i in range(11)]) - plt.gca().set_yticklabels([f'{i:.1f}' for i in plt.gca().get_yticks()]) + plt.gca().set_yticklabels([f"{i:.1f}" for i in plt.gca().get_yticks()]) plt.title(title) plt.legend() if save_plot: @@ -61,71 +82,222 @@ def concat_embeddings(tables1: [MappingTable], tables2: [MappingTable]): tables1_cleaned = [copy.deepcopy(table) for table in tables1] tables2_cleaned = [copy.deepcopy(table) for table in tables2] for table1, table2 in zip(tables1_cleaned, tables2_cleaned): - table1.joined_mapping_table.dropna(subset=['embedding', 'description'], inplace=True) - table2.joined_mapping_table.dropna(subset=['embedding', 'description'], inplace=True) - vectors_tables1 = np.concatenate([table.get_embeddings_numpy() for table in tables1_cleaned]) - vectors_tables2 = np.concatenate([table.get_embeddings_numpy() for table in tables2_cleaned]) - descriptions_table1 = np.concatenate([table.joined_mapping_table["description"] for table in tables1_cleaned]) - descriptions_table2 = np.concatenate([table.joined_mapping_table["description"] for table in tables2_cleaned]) - boundaries1 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables1_cleaned]) - boundaries2 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables2_cleaned]) + table1.joined_mapping_table.dropna( + subset=["embedding", "description"], inplace=True + ) + table2.joined_mapping_table.dropna( + subset=["embedding", "description"], inplace=True + ) + vectors_tables1 = np.concatenate( + [table.get_embeddings_numpy() for table in tables1_cleaned] + ) + vectors_tables2 = np.concatenate( + [table.get_embeddings_numpy() for table in tables2_cleaned] + ) + descriptions_table1 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables1_cleaned] + ) + descriptions_table2 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables2_cleaned] + ) + boundaries1 = np.array( + [ + table.joined_mapping_table["embedding"].index.size + for table in tables1_cleaned + ] + ) + boundaries2 = np.array( + [ + table.joined_mapping_table["embedding"].index.size + for table in tables2_cleaned + ] + ) vectors_concatenated = np.concatenate([vectors_tables1, vectors_tables2]) - descriptions_concatenated = np.concatenate([descriptions_table1, descriptions_table2]) - boundaries_concatenated = size_array_to_boundaries(np.concatenate([boundaries1, boundaries2])) + descriptions_concatenated = np.concatenate( + [descriptions_table1, descriptions_table2] + ) + boundaries_concatenated = size_array_to_boundaries( + np.concatenate([boundaries1, boundaries2]) + ) return vectors_concatenated, descriptions_concatenated, boundaries_concatenated -def scatter_plot_two_distributions(tables1: [MappingTable], tables2: [MappingTable], label1: str, label2: str, - 
store_html: bool = True,
-                                    store_destination: str = "resources/results/plots/ad_vs_pd.html"):
-    vectors_tables1 = np.concatenate([table.get_embeddings_numpy() for table in tables1])
-    vectors_tables2 = np.concatenate([table.get_embeddings_numpy() for table in tables2])
+def bar_chart_average_acc_two_distributions(
+    dist1_fuzzy: pd.DataFrame,
+    dist1_gpt: pd.DataFrame,
+    dist1_mpnet: pd.DataFrame,
+    dist2_fuzzy: pd.DataFrame,
+    dist2_gpt: pd.DataFrame,
+    dist2_mpnet: pd.DataFrame,
+    title: str,
+    label1: str,
+    label2: str,
+):
+    if not all(
+        gpt.shape == fuzzy.shape == mpnet.shape
+        for gpt, mpnet, fuzzy in [
+            (dist1_gpt, dist1_mpnet, dist1_fuzzy),
+            (dist2_gpt, dist2_mpnet, dist2_fuzzy),
+        ]
+    ):
+        raise ValueError(
+            "The fuzzy, GPT and MPNet DataFrames of each distribution must have the same dimensions"
+        )
+    if not all(dist.shape[0] == dist.shape[1] for dist in [dist1_fuzzy, dist2_fuzzy]):
+        raise ValueError("Each accuracy DataFrame must be square")
+    if not all(
+        fuzzy.index.equals(gpt.index) and fuzzy.columns.equals(gpt.columns)
+        for fuzzy, gpt in [(dist1_fuzzy, dist1_gpt), (dist2_fuzzy, dist2_gpt)]
+    ):
+        raise ValueError(
+            "All row and column labels within each pair of fuzzy and GPT DataFrames must be equal"
+        )
+    # average value without the diagonal, since the diagonal contains matches of a cohort against itself
+    avg_acc_fuzzy1 = np.mean(
+        dist1_fuzzy.values[~np.eye(dist1_fuzzy.shape[0], dtype=bool)]
+    )
+    avg_acc_fuzzy2 = np.mean(
+        dist2_fuzzy.values[~np.eye(dist2_fuzzy.shape[0], dtype=bool)]
+    )
+    avg_acc_gpt1 = np.mean(dist1_gpt.values[~np.eye(dist1_gpt.shape[0], dtype=bool)])
+    avg_acc_gpt2 = np.mean(dist2_gpt.values[~np.eye(dist2_gpt.shape[0], dtype=bool)])
+    avg_acc_mpnet1 = np.mean(
+        dist1_mpnet.values[~np.eye(dist1_mpnet.shape[0], dtype=bool)]
+    )
+    avg_acc_mpnet2 = np.mean(
+        dist2_mpnet.values[~np.eye(dist2_mpnet.shape[0], dtype=bool)]
+    )
+    data = {
+        "Fuzzy String Matching": [avg_acc_fuzzy1, avg_acc_fuzzy2],
+        "GPT Embeddings": [avg_acc_gpt1, avg_acc_gpt2],
+        "MPNet Embeddings": [avg_acc_mpnet1, avg_acc_mpnet2],
+    }
+    df = pd.DataFrame(data, index=[label1, label2])
+    # df: rows = the two distributions, columns = the three matching methods
+    df_melted = df.reset_index().melt(
+        id_vars="index", var_name="Method", value_name="Accuracy"
+    )
+    plt.figure(figsize=(10, 6))
+    sns.set(style="whitegrid")
+    sns.barplot(x="index", y="Accuracy", hue="Method", data=df_melted)
+    plt.xlabel("")
+    plt.ylabel("Average Accuracy")
+    plt.title(title)
+    plt.show()
+
+
+def scatter_plot_two_distributions(
+    tables1: [MappingTable],
+    tables2: [MappingTable],
+    label1: str,
+    label2: str,
+    store_html: bool = True,
+    legend_font_size: int = 16,
+    store_destination: str = "resources/results/plots/ad_vs_pd.html",
+):
+    vectors_tables1 = np.concatenate(
+        [table.get_embeddings_numpy() for table in tables1]
+    )
+    vectors_tables2 = np.concatenate(
+        [table.get_embeddings_numpy() for table in tables2]
+    )
     # remove entries that do not contain an embedding -> have no corresponding vector
-    [table.joined_mapping_table.dropna(subset=['embedding'], inplace=True) for table in tables1]
-    [table.joined_mapping_table.dropna(subset=['embedding'], inplace=True) for table in tables2]
+    [
+        table.joined_mapping_table.dropna(subset=["embedding"], inplace=True)
+        for table in tables1
+    ]
+    [
+        table.joined_mapping_table.dropna(subset=["embedding"], inplace=True)
+        for table in tables2
+    ]
     # get descriptions as interactive labels
-    labels_table1 = np.concatenate([table.joined_mapping_table["description"] for table in tables1])
-    labels_table2 =
np.concatenate([table.joined_mapping_table["description"] for table in tables2]) + labels_table1 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables1] + ) + labels_table2 = np.concatenate( + [table.joined_mapping_table["description"] for table in tables2] + ) # boundary for concatenated vector class_boundary = len(vectors_tables1) vectors_concatenated = np.concatenate([vectors_tables1, vectors_tables2]) tsne = TSNE(n_components=2, perplexity=30, random_state=42) tsne_result = tsne.fit_transform(vectors_concatenated) fig = go.Figure() - fig.add_trace(go.Scatter(x=tsne_result[:class_boundary, 0], y=tsne_result[:class_boundary, 1], - mode="markers", name=label1, text=labels_table1)) - fig.add_trace(go.Scatter(x=tsne_result[class_boundary:, 0], y=tsne_result[class_boundary:, 1], - mode="markers", name=label2, text=labels_table2)) + # bigger legend size + fig.update_layout(legend=dict(font=dict(size=legend_font_size))) + fig.add_trace( + go.Scatter( + x=tsne_result[:class_boundary, 0], + y=tsne_result[:class_boundary, 1], + mode="markers", + name=label1, + text=labels_table1, + ) + ) + fig.add_trace( + go.Scatter( + x=tsne_result[class_boundary:, 0], + y=tsne_result[class_boundary:, 1], + mode="markers", + name=label2, + text=labels_table2, + ) + ) fig.show() if store_html: fig.write_html(store_destination) -def scatter_plot_all_cohorts(tables1: [MappingTable], tables2: [MappingTable], labels1: [str], labels2: [str], - plot_side: PlotSide = PlotSide.BOTH, store_html: bool = True, - store_base_dir: str = "resources/results/plots"): +def scatter_plot_all_cohorts( + tables1: [MappingTable], + tables2: [MappingTable], + labels1: [str], + labels2: [str], + plot_side: PlotSide = PlotSide.BOTH, + store_html: bool = True, + legend_font_size: int = 16, + store_base_dir: str = "resources/results/plots", +): if not len(tables1) == len(labels1) or not len(tables2) == len(labels2): raise ValueError("Length of corresponding tables and labels must be equal!") tables_boundary = len(tables1) vectors, descriptions, boundaries = concat_embeddings(tables1, tables2) - tsne = TSNE(n_components=2, perplexity=(30 if len(vectors) > 30 else len(vectors) - 1), random_state=42) + tsne = TSNE( + n_components=2, + perplexity=(30 if len(vectors) > 30 else len(vectors) - 1), + random_state=42, + ) tsne_result = tsne.fit_transform(vectors) + # more distinct colors + color_scale = px.colors.qualitative.Set3 fig = go.Figure() + # bigger legend size + fig.update_layout(legend=dict(font=dict(size=legend_font_size))) # first cohort is from 0 to x boundaries = np.insert(boundaries, 0, 0) for idx in range(len(tables1)): if labels1[idx]: - fig.add_trace(go.Scatter(x=tsne_result[boundaries[idx]:boundaries[idx + 1], 0], - y=tsne_result[boundaries[idx]:boundaries[idx + 1], 1], - mode="markers", name=labels1[idx], - text=descriptions[boundaries[idx]:boundaries[idx + 1]], - line=dict(color=get_cohort_specific_color_code(labels1[idx])))) + fig.add_trace( + go.Scatter( + x=tsne_result[boundaries[idx] : boundaries[idx + 1], 0], + y=tsne_result[boundaries[idx] : boundaries[idx + 1], 1], + mode="markers", + name=labels1[idx], + text=descriptions[boundaries[idx] : boundaries[idx + 1]], + # line=dict(color=get_cohort_specific_color_code(labels1[idx])) + ) + ) for idy in range(len(tables1), len(boundaries) - 1): - fig.add_trace(go.Scatter(x=tsne_result[boundaries[idy]:boundaries[idy + 1], 0], - y=tsne_result[boundaries[idy]:boundaries[idy + 1], 1], - mode="markers", name=labels2[idy - len(tables1)], - 
text=descriptions[boundaries[idy]:boundaries[idy + 1]], - line=dict(color=get_cohort_specific_color_code(labels2[idy - len(tables1)])))) + fig.add_trace( + go.Scatter( + x=tsne_result[boundaries[idy] : boundaries[idy + 1], 0], + y=tsne_result[boundaries[idy] : boundaries[idy + 1], 1], + mode="markers", + name=labels2[idy - len(tables1)], + text=descriptions[boundaries[idy] : boundaries[idy + 1]], + # line=dict(color=get_cohort_specific_color_code(labels2[idy - len(tables1)])) + ) + ) if store_html: fig.write_html(store_base_dir + "/tsne_all_cohorts.html") fig.show() diff --git a/requirements.txt b/requirements.txt index 7b21b4c..0610d8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,18 @@ +matplotlib~=3.8.1 numpy==1.25.2 +openai~=0.28.0 +openpyxl pandas==2.1.0 pip==21.3.1 +plotly~=5.17.0 python-dateutil==2.8.2 +python-dotenv~=1.0.0 pytz==2023.3 +seaborn~=0.13.0 +sentence-transformers==2.3.1 setuptools==60.2.0 +scikit-learn==1.3.2 six==1.16.0 -tzdata==2023.3 -wheel==0.37.1 -openpyxl -openai~=0.28.0 -scikit-learn~=1.3.0 -plotly~=5.17.0 -python-dotenv~=1.0.0 thefuzz~=0.20.0 -matplotlib~=3.8.1 -seaborn~=0.13.0 \ No newline at end of file +tzdata==2023.3 +wheel==0.37.1 \ No newline at end of file diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 143dbf4..1f0a03e 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -3,7 +3,12 @@ import numpy as np -from index.evaluation import match_closest_descriptions, MatchingMethod, enrichment_analysis, score_mappings +from index.evaluation import ( + match_closest_descriptions, + MatchingMethod, + enrichment_analysis, + score_mappings, +) from index.mapping import MappingTable from index.parsing import MappingSource, DataDictionarySource @@ -12,19 +17,46 @@ class Test(TestCase): TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) - mapping_source = MappingSource(os.path.join(TEST_DIR_PATH, "resources", 'test_mapping.xlsx'), "VAR_1", "ID_1") - data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'), - "VAR_1", "DESC") + mapping_source = MappingSource( + os.path.join(TEST_DIR_PATH, "resources", "test_mapping.xlsx"), "VAR_1", "ID_1" + ) + data_dictionary_source = DataDictionarySource( + os.path.join(TEST_DIR_PATH, "resources", "test_data_dict.csv"), "VAR_1", "DESC" + ) def test_match_closest_descriptions_embeddings(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - embeddings1 = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], np.nan, np.nan, [8, 8], [9, 9], np.nan] - embeddings2 = [[0, 0], np.nan, [9, 9], [3, 3], [7, 7], [5.1, 5.1], [5, 5], [4, 4], np.nan] - mapping_table1.joined_mapping_table['embedding'] = embeddings1 - mapping_table2.joined_mapping_table['embedding'] = embeddings2 + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + embeddings1 = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 5], + np.nan, + np.nan, + [8, 8], + [9, 9], + np.nan, + ] + embeddings2 = [ + [0, 0], + np.nan, + [9, 9], + [3, 3], + [7, 7], + [5.1, 5.1], + [5, 5], + [4, 4], + np.nan, + ] + mapping_table1.joined_mapping_table["embedding"] = embeddings1 + mapping_table2.joined_mapping_table["embedding"] = 
embeddings2 result = match_closest_descriptions(mapping_table1, mapping_table2) self.assertEqual(3, result["correct"].sum()) @@ -32,34 +64,87 @@ def test_score_mappings(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - embeddings1 = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], np.nan, np.nan, [8, 8], [9, 9], np.nan] - embeddings2 = [[0, 0], np.nan, [9, 9], [3, 3], [7, 7], [5.1, 5.1], [5, 5], [4, 4], np.nan] - mapping_table1.joined_mapping_table['embedding'] = embeddings1 - mapping_table2.joined_mapping_table['embedding'] = embeddings2 + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + embeddings1 = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 5], + np.nan, + np.nan, + [8, 8], + [9, 9], + np.nan, + ] + embeddings2 = [ + [0, 0], + np.nan, + [9, 9], + [3, 3], + [7, 7], + [5.1, 5.1], + [5, 5], + [4, 4], + np.nan, + ] + mapping_table1.joined_mapping_table["embedding"] = embeddings1 + mapping_table2.joined_mapping_table["embedding"] = embeddings2 # 2 should be correct out of a total of 4 valid mappings (possible matches, no nan) result = match_closest_descriptions(mapping_table1, mapping_table2) acc = score_mappings(result) - self.assertEqual(3/5, acc) + self.assertEqual(3 / 5, acc) def test_match_closest_description_fuzzy(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - result = match_closest_descriptions(mapping_table1, mapping_table2, - matching_method=MatchingMethod.FUZZY_STRING_MATCHING) + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + result = match_closest_descriptions( + mapping_table1, + mapping_table2, + matching_method=MatchingMethod.FUZZY_STRING_MATCHING, + ) self.assertEqual(7, result["correct"].sum()) def test_enrichment_analysis_embeddings(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - embeddings1 = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], np.nan, np.nan, [8, 8], [9, 9], np.nan] - embeddings2 = [[0, 0], np.nan, [9, 9], [3, 3], [7, 7], [5.1, 5.1], [5, 5], [4, 4], np.nan] - mapping_table1.joined_mapping_table['embedding'] = embeddings1 - mapping_table2.joined_mapping_table['embedding'] = embeddings2 + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + embeddings1 = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 5], + np.nan, + np.nan, + [8, 8], + [9, 9], + np.nan, + ] + embeddings2 = [ + [0, 0], + np.nan, + [9, 9], + [3, 3], + [7, 7], + [5.1, 5.1], + [5, 5], + [4, 4], + np.nan, + ] + mapping_table1.joined_mapping_table["embedding"] = embeddings1 + mapping_table2.joined_mapping_table["embedding"] = 
embeddings2 result = enrichment_analysis(mapping_table1, mapping_table2, 5) self.assertListEqual([3 / 5, 3 / 5, 4 / 5, 4 / 5, 1], result.tolist()) @@ -67,7 +152,13 @@ def test_enrichment_analysis_fuzzy(self): mapping_table1 = MappingTable(self.mapping_source, self.data_dictionary_source) mapping_table2 = MappingTable(self.mapping_source, self.data_dictionary_source) # make the second mapping table shorter to test the case where there are more descriptions in the first - mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[:-2] - result = enrichment_analysis(mapping_table1, mapping_table2, 5, - matching_method=MatchingMethod.FUZZY_STRING_MATCHING) + mapping_table2.joined_mapping_table = mapping_table2.joined_mapping_table.iloc[ + :-2 + ] + result = enrichment_analysis( + mapping_table1, + mapping_table2, + 5, + matching_method=MatchingMethod.FUZZY_STRING_MATCHING, + ) self.assertListEqual([1, 1, 1, 1, 1], result.tolist()) diff --git a/tests/test_parser.py b/tests/test_parser.py index 14555a1..417e850 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -8,9 +8,12 @@ class Test(TestCase): TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) - mapping_source = MappingSource(os.path.join(TEST_DIR_PATH, "resources", 'test_mapping.xlsx'), "VAR_1", "ID_1") - data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'), - "VAR_1", "DESC") + mapping_source = MappingSource( + os.path.join(TEST_DIR_PATH, "resources", "test_mapping.xlsx"), "VAR_1", "ID_1" + ) + data_dictionary_source = DataDictionarySource( + os.path.join(TEST_DIR_PATH, "resources", "test_data_dict.csv"), "VAR_1", "DESC" + ) def test_parse(self): mapping_table = MappingTable(self.mapping_source, self.data_dictionary_source) @@ -34,8 +37,14 @@ def test_parse_add_description_later(self): def test_parse_data_dict_excel(self): mapping_table = MappingTable(self.mapping_source) data_dictionary_source = DataDictionarySource( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources", 'test_data_dict.xlsx'), - "VAR_1", "DESC") + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "resources", + "test_data_dict.xlsx", + ), + "VAR_1", + "DESC", + ) mapping_table.add_descriptions(data_dictionary_source) mappings = mapping_table.get_mappings() self.assertEqual(11, len(mappings)) diff --git a/tests/test_visualisation.py b/tests/test_visualisation.py index 0c74b8c..b3bd61b 100644 --- a/tests/test_visualisation.py +++ b/tests/test_visualisation.py @@ -2,18 +2,28 @@ from unittest import TestCase import numpy as np +import pandas as pd +from index.evaluation import evaluate from index.mapping import MappingTable from index.parsing import MappingSource, DataDictionarySource -from index.visualisation import scatter_plot_two_distributions, enrichment_plot, scatter_plot_all_cohorts +from index.visualisation import ( + scatter_plot_two_distributions, + enrichment_plot, + scatter_plot_all_cohorts, + bar_chart_average_acc_two_distributions, +) class Test(TestCase): TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) - mapping_source = MappingSource(os.path.join(TEST_DIR_PATH, "resources", 'test_mapping.xlsx'), "VAR_1", "ID_1") - data_dictionary_source = DataDictionarySource(os.path.join(TEST_DIR_PATH, "resources", 'test_data_dict.csv'), - "VAR_1", "DESC") + mapping_source = MappingSource( + os.path.join(TEST_DIR_PATH, "resources", "test_mapping.xlsx"), "VAR_1", "ID_1" + ) + data_dictionary_source = DataDictionarySource( + 
os.path.join(TEST_DIR_PATH, "resources", "test_data_dict.csv"), "VAR_1", "DESC" + ) embeddings1 = [ [1.1, 2.2, 3.3], @@ -26,7 +36,7 @@ class Test(TestCase): [22.4, 23.5, 24.6], [25.7, 26.8, 27.9], [28.1, 29.2, 30.3], - [31.4, 32.5, 33.6] + [31.4, 32.5, 33.6], ] embeddings2 = [ @@ -40,7 +50,7 @@ class Test(TestCase): [23.4, 24.5, 25.6], [26.7, 27.8, 28.9], np.nan, - [32.4, 33.5, 34.6] + [32.4, 33.5, 34.6], ] embeddings3 = [ @@ -54,7 +64,7 @@ class Test(TestCase): [24.4, 25.5, 26.6], [27.7, 28.8, 29.9], [30.1, 31.2, 32.3], - [33.4, 34.5, 35.6] + [33.4, 34.5, 35.6], ] embeddings4 = [ @@ -68,7 +78,7 @@ class Test(TestCase): [25.4, 26.5, 27.6], [28.7, 29.8, 30.9], [31.1, 32.2, 33.3], - np.nan + np.nan, ] def test_scatter_plot_two_distributions(self): @@ -81,12 +91,17 @@ def test_scatter_plot_two_distributions(self): mapping_table4 = MappingTable(self.mapping_source) mapping_table4.add_descriptions(self.data_dictionary_source) - mapping_table1.joined_mapping_table['embedding'] = self.embeddings1 - mapping_table2.joined_mapping_table['embedding'] = self.embeddings2 - mapping_table3.joined_mapping_table['embedding'] = self.embeddings3 - mapping_table4.joined_mapping_table['embedding'] = self.embeddings4 - scatter_plot_two_distributions([mapping_table1, mapping_table2], [mapping_table3, mapping_table4], "A", "B", - store_html=False) + mapping_table1.joined_mapping_table["embedding"] = self.embeddings1 + mapping_table2.joined_mapping_table["embedding"] = self.embeddings2 + mapping_table3.joined_mapping_table["embedding"] = self.embeddings3 + mapping_table4.joined_mapping_table["embedding"] = self.embeddings4 + scatter_plot_two_distributions( + [mapping_table1, mapping_table2], + [mapping_table3, mapping_table4], + "A", + "B", + store_html=False, + ) def test_scatter_plot_all_cohorts(self): mapping_table1 = MappingTable(self.mapping_source) @@ -98,15 +113,52 @@ def test_scatter_plot_all_cohorts(self): mapping_table4 = MappingTable(self.mapping_source) mapping_table4.add_descriptions(self.data_dictionary_source) - mapping_table1.joined_mapping_table['embedding'] = self.embeddings1 - mapping_table2.joined_mapping_table['embedding'] = self.embeddings2 - mapping_table3.joined_mapping_table['embedding'] = self.embeddings3 - mapping_table4.joined_mapping_table['embedding'] = self.embeddings4 - scatter_plot_all_cohorts([mapping_table1, mapping_table2], [mapping_table3, mapping_table4], - ["A1", "A2"], ["B1", "B2"], store_html=False) + mapping_table1.joined_mapping_table["embedding"] = self.embeddings1 + mapping_table2.joined_mapping_table["embedding"] = self.embeddings2 + mapping_table3.joined_mapping_table["embedding"] = self.embeddings3 + mapping_table4.joined_mapping_table["embedding"] = self.embeddings4 + scatter_plot_all_cohorts( + [mapping_table1, mapping_table2], + [mapping_table3, mapping_table4], + ["A1", "A2"], + ["B1", "B2"], + store_html=False, + ) def test_enrichment_plot(self): acc_gpt = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0] + acc_mpnet = [0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0] acc_fuzzy = [0.2, 0.3, 0.4, 0.5, 0.8, 0.9, 1.0, 1.0, 1.0, 1.0] title = "Test" - enrichment_plot(acc_gpt, acc_fuzzy, title, save_plot=False) + enrichment_plot(acc_gpt, acc_mpnet, acc_fuzzy, title, save_plot=False) + + def test_bar_chart_average_acc_two_distributions(self): + labels = ["M1", "M2", "M3"] + fuzzy_1 = pd.DataFrame( + {"M1": [1, 0.2, 0.23], "M2": [0.3, 1, 0.16], "M3": [0.27, 0.22, 1]}, + index=labels, + ).T + fuzzy_2 = pd.DataFrame( + {"M1": [1, 0.19, 0.21], "M2": [0.29, 1, 0.18], 
"M3": [0.29, 0.21, 1]}, + index=labels, + ).T + gpt_1 = pd.DataFrame( + {"M1": [1, 0.9, 0.78], "M2": [0.8, 1, 0.78], "M3": [0.82, 0.89, 1]}, + index=labels, + ).T + gpt_2 = pd.DataFrame( + {"M1": [1, 0.88, 0.78], "M2": [0.79, 1, 0.78], "M3": [0.81, 0.85, 1]}, + index=labels, + ).T + mpnet_1 = pd.DataFrame( + {"M1": [1, 0.8, 0.7], "M2": [0.7, 0.9, 0.68], "M3": [0.72, 0.79, 0.9]}, + index=labels, + ).T + mpnet_2 = pd.DataFrame( + {"M1": [0.9, 0.78, 0.68], "M2": [0.69, 0.9, 0.68], "M3": [0.71, 0.75, 0.9]}, + index=labels, + ).T + + bar_chart_average_acc_two_distributions( + fuzzy_1, gpt_1, mpnet_1, fuzzy_2, gpt_2, mpnet_2, "title", "AD", "PD" + ) From 041c4caccb6e7bb13b714cb51f9c2c9ea06075db Mon Sep 17 00:00:00 2001 From: Mehmet Can Ay Date: Wed, 14 Feb 2024 16:44:12 +0100 Subject: [PATCH 2/3] fix: formatting --- index/embedding.py | 4 +- index/evaluation.py | 126 ++++++++++------------------------------- index/main.py | 20 ++++++- index/mapping.py | 2 +- index/model.py | 56 ------------------ index/visualisation.py | 56 +++++------------- 6 files changed, 63 insertions(+), 201 deletions(-) delete mode 100644 index/model.py diff --git a/index/embedding.py b/index/embedding.py index be42cc3..c7b4639 100644 --- a/index/embedding.py +++ b/index/embedding.py @@ -27,9 +27,7 @@ def get_embedding(self, text: str, model="text-embedding-ada-002"): return None if isinstance(text, str): text = text.replace("\n", " ") - return openai.Embedding.create(input=[text], model=model)["data"][0][ - "embedding" - ] + return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"] except Exception as e: logging.error(f"Error getting embedding for {text}: {e}") return None diff --git a/index/evaluation.py b/index/evaluation.py index 8c9160b..b5c6925 100644 --- a/index/evaluation.py +++ b/index/evaluation.py @@ -36,26 +36,16 @@ def enrichment_analysis( # not every variable can be matched max_matches = 0 # clean up source and target table (missing embeddings, descriptions etc.) 
- source_table.joined_mapping_table.drop_duplicates( - subset=["variable"], keep="first", inplace=True - ) + source_table.joined_mapping_table.drop_duplicates(subset=["variable"], keep="first", inplace=True) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) - if ( - matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE - or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE - ): + if (matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE or matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE): source_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) target_table.joined_mapping_table.dropna(subset=["embedding"], inplace=True) # re-index to account for dropped rows - target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index( - drop=True - ) + target_table.joined_mapping_table = target_table.joined_mapping_table.reset_index(drop=True) for idx, source_table_row in source_table.joined_mapping_table.iterrows(): - correct_target_index = target_table.joined_mapping_table[ - target_table.joined_mapping_table["identifier"] - == source_table_row["identifier"] - ].index + correct_target_index = target_table.joined_mapping_table[target_table.joined_mapping_table["identifier"] == source_table_row["identifier"]].index if len(correct_target_index) == 0: # can not be matched -> skip continue @@ -67,37 +57,21 @@ def enrichment_analysis( if matching_method == MatchingMethod.EUCLIDEAN_EMBEDDING_DISTANCE: source_table_embedding = source_table_row["embedding"] target_table_embedding = target_table_row["embedding"] - distances.append( - np.linalg.norm( - np.array(source_table_embedding) - - np.array(target_table_embedding) - ) - ) + distances.append(np.linalg.norm(np.array(source_table_embedding) - np.array(target_table_embedding))) elif matching_method == MatchingMethod.COSINE_EMBEDDING_DISTANCE: source_table_embedding = np.array(source_table_row["embedding"]) target_table_embedding = np.array(target_table_row["embedding"]) - distances.append( - distance.cosine(source_table_embedding, target_table_embedding) - ) + distances.append(distance.cosine(source_table_embedding, target_table_embedding)) elif matching_method == MatchingMethod.FUZZY_STRING_MATCHING: source_table_description = source_table_row["description"] target_table_description = target_table_row["description"] - distances.append( - 100 - fuzz.ratio(source_table_description, target_table_description) - ) + distances.append(100 - fuzz.ratio(source_table_description, target_table_description)) else: - raise NotImplementedError( - "Specified matching method is not implemented!" 
- ) - min_distance_indices = np.argsort(np.array(distances))[ - :max_cumulative_match_rank - ] + raise NotImplementedError("Specified matching method is not implemented!") + min_distance_indices = np.argsort(np.array(distances))[:max_cumulative_match_rank] for n in range(max_cumulative_match_rank): # (due to upper level concepts) there may be more than one correct mapping - if any( - element in min_distance_indices[: n + 1] - for element in correct_target_index - ): + if any(element in min_distance_indices[: n + 1] for element in correct_target_index): correct_matches[n] += 1 return (correct_matches / max_matches).round(2) @@ -118,9 +92,7 @@ def match_closest_descriptions( """ # sometimes the same concept gets mapped against multiple concepts in CDM, resulting in artifacts in the results # -> drop duplicates, only keep first - source_table.joined_mapping_table.drop_duplicates( - subset=["variable"], keep="first", inplace=True - ) + source_table.joined_mapping_table.drop_duplicates(subset=["variable"], keep="first", inplace=True) # remove rows from source and target that do not contain either a description (in general) or embedding (for gpt) source_table.joined_mapping_table.dropna(subset=["description"], inplace=True) target_table.joined_mapping_table.dropna(subset=["description"], inplace=True) @@ -280,60 +252,22 @@ def evaluate( model="gpt", results_root_dir="resources/results/pd", ): - - if model == "gpt": - data_gpt = {} - data_fuzzy = {} - for idx, source in enumerate(datasets): - acc_gpt = [] - acc_fuzzy = [] - for idy, target in enumerate(datasets): - map_gpt = match_closest_descriptions(source, target) - map_fuzzy = match_closest_descriptions( - source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING - ) - if target == "jadni": - print("check") - if store_results: - map_gpt.to_excel( - results_root_dir - + "/gpt_" - + f"{labels[idx]}_to_{labels[idy]}.xlsx" - ) - map_fuzzy.to_excel( - results_root_dir - + "/fuzzy_" - + f"{labels[idx]}_to_{labels[idy]}.xlsx" - ) - acc_gpt.append(round(score_mappings(map_gpt), 2)) - acc_fuzzy.append(round(score_mappings(map_fuzzy), 2)) - data_gpt[labels[idx]] = acc_gpt - data_fuzzy[labels[idx]] = acc_fuzzy - # transpose to have from -> to | row -> column like in the paper - gpt = pd.DataFrame(data_gpt, index=labels).T - fuzzy = pd.DataFrame(data_fuzzy, index=labels).T - return gpt, fuzzy - - elif model == "mpnet": - data_mpnet = {} - for idx, source in enumerate(datasets): - acc_mpnet = [] - for idy, target in enumerate(datasets): - map_mpnet = match_closest_descriptions( - source, - target, - matching_method=MatchingMethod.COSINE_EMBEDDING_DISTANCE, - ) - if target == "jadni": - print("check") - if store_results: - map_mpnet.to_excel( - results_root_dir - + "/mpnet_" - + f"{labels[idx]}_to_{labels[idy]}.xlsx" - ) - acc_mpnet.append(round(score_mappings(map_mpnet), 2)) - data_mpnet[labels[idx]] = acc_mpnet - # transpose to have from -> to | row -> column like in the paper - mpnet = pd.DataFrame(data_mpnet, index=labels).T - return mpnet + data = {} + for idx, source in enumerate(datasets): + acc = [] + for idy, target in enumerate(datasets): + if model == "gpt": + map = match_closest_descriptions(source, target) + elif model == "mpnet": + map = match_closest_descriptions(source,target, matching_method=MatchingMethod.COSINE_EMBEDDING_DISTANCE) + elif model == "fuzzy": + map = match_closest_descriptions(source, target, matching_method=MatchingMethod.FUZZY_STRING_MATCHING) + else: + raise NotImplementedError("Specified model is not 
implemented!") + if store_results: + map.to_excel(results_root_dir + f"/{model}_" + f"{labels[idx]}_to_{labels[idy]}.xlsx") + acc.append(round(score_mappings(map), 2)) + data[labels[idx]] = acc + # transpose to have from -> to | row -> column like in the paper + model_output = pd.DataFrame(data, index=labels).T + return model_output diff --git a/index/main.py b/index/main.py index 08dbe96..d36eb54 100644 --- a/index/main.py +++ b/index/main.py @@ -236,7 +236,7 @@ print(ppmi_luxpark_enrichment_mpnet) print(ppmi_luxpark_enrichment_fuzzy) - gpt_table1, fuzzy_table1 = evaluate( + gpt_table1 = evaluate( pd_datasets_gpt, pd_datasets_labels, store_results=True, @@ -244,6 +244,14 @@ results_root_dir="./resources/results/pd", ) + fuzzy_table1 = evaluate( + pd_datasets_gpt, + pd_datasets_labels, + store_results=True, + model="fuzzy", + results_root_dir="./resources/results/pd", + ) + mpnet_table1 = evaluate( pd_datasets_mpnet, pd_datasets_labels, @@ -568,7 +576,7 @@ "VITA", "AD-Mapper", ] - gpt_table2, fuzzy_table2 = evaluate( + gpt_table2 = evaluate( ad_datasets_gpt, ad_datasets_labels, store_results=True, @@ -576,6 +584,14 @@ results_root_dir="resources/results/ad", ) + fuzzy_table2 = evaluate( + ad_datasets_gpt, + ad_datasets_labels, + store_results=True, + model="fuzzy", + results_root_dir="resources/results/ad", + ) + mpnet_table2 = evaluate( ad_datasets_mpnet, ad_datasets_labels, diff --git a/index/mapping.py b/index/mapping.py index 5f35013..307ed2d 100644 --- a/index/mapping.py +++ b/index/mapping.py @@ -2,7 +2,7 @@ import numpy as np from index.embedding import EmbeddingModel -from index.model import Terminology, Mapping, Concept, Variable +from index.db.model import Terminology, Mapping, Concept, Variable from index.parsing import MappingSource, DataDictionarySource, EmbeddingSource diff --git a/index/model.py b/index/model.py deleted file mode 100644 index c30eca5..0000000 --- a/index/model.py +++ /dev/null @@ -1,56 +0,0 @@ -import pandas as pd - - -class Terminology: - - def __int__(self, identifier: str, name: str): - self.identifier = identifier - self.name = name - - -class Concept: - - def __init__(self, identifier: str, terminology: Terminology): - self.identifier = identifier - self.terminology = terminology - - -class Embedding: - - def __init__(self, embedding: [float], source: str): - self.embedding = embedding - self.source = source - - def to_dataframe(self): - return pd.DataFrame(self.embedding, columns=[self.source]) - - -class Variable: - - def __init__( - self, name: str, description: str, source: str, embedding: Embedding = None - ): - self.name = name - self.description = description - self.source = source - self.embedding = embedding - - -class Mapping: - - def __init__(self, concept: Concept, variable: Variable, source: str): - self.concept = concept - self.variable = variable - self.source = source - - def __eq__(self, other): - return ( - self.concept.identifier == other.concept.identifier - and self.variable.name == other.variable.name - ) - - def __hash__(self): - return hash((self.concept.identifier, self.variable.name)) - - def __str__(self): - return f"{self.variable.name} ({self.variable.description}) -> {self.concept.identifier}" diff --git a/index/visualisation.py b/index/visualisation.py index f88d9fe..b9ea0ec 100644 --- a/index/visualisation.py +++ b/index/visualisation.py @@ -14,8 +14,8 @@ class PlotSide(Enum): - LEFT = (1,) - RIGHT = (2,) + LEFT = 1, + RIGHT = 2, BOTH = 3 @@ -43,11 +43,7 @@ def enrichment_plot( save_plot=False, 
save_dir="resources/results/plots",
 ):
-    if (
-        len(acc_gpt) != len(acc_fuzzy)
-        or len(acc_gpt) != len(acc_mpnet)
-        or len(acc_mpnet) != len(acc_fuzzy)
-    ):
+    if not (len(acc_gpt) == len(acc_fuzzy) == len(acc_mpnet)):
         raise ValueError(
             "acc_gpt, acc_mpnet and acc_fuzzy should be of the same length!"
         )
@@ -82,43 +78,17 @@ def concat_embeddings(tables1: [MappingTable], tables2: [MappingTable]):
     tables1_cleaned = [copy.deepcopy(table) for table in tables1]
     tables2_cleaned = [copy.deepcopy(table) for table in tables2]
     for table1, table2 in zip(tables1_cleaned, tables2_cleaned):
-        table1.joined_mapping_table.dropna(
-            subset=["embedding", "description"], inplace=True
-        )
-        table2.joined_mapping_table.dropna(
-            subset=["embedding", "description"], inplace=True
-        )
-    vectors_tables1 = np.concatenate(
-        [table.get_embeddings_numpy() for table in tables1_cleaned]
-    )
-    vectors_tables2 = np.concatenate(
-        [table.get_embeddings_numpy() for table in tables2_cleaned]
-    )
-    descriptions_table1 = np.concatenate(
-        [table.joined_mapping_table["description"] for table in tables1_cleaned]
-    )
-    descriptions_table2 = np.concatenate(
-        [table.joined_mapping_table["description"] for table in tables2_cleaned]
-    )
-    boundaries1 = np.array(
-        [
-            table.joined_mapping_table["embedding"].index.size
-            for table in tables1_cleaned
-        ]
-    )
-    boundaries2 = np.array(
-        [
-            table.joined_mapping_table["embedding"].index.size
-            for table in tables2_cleaned
-        ]
-    )
+        table1.joined_mapping_table.dropna(subset=["embedding", "description"], inplace=True)
+        table2.joined_mapping_table.dropna(subset=["embedding", "description"], inplace=True)
+    vectors_tables1 = np.concatenate([table.get_embeddings_numpy() for table in tables1_cleaned])
+    vectors_tables2 = np.concatenate([table.get_embeddings_numpy() for table in tables2_cleaned])
+    descriptions_table1 = np.concatenate([table.joined_mapping_table["description"] for table in tables1_cleaned])
+    descriptions_table2 = np.concatenate([table.joined_mapping_table["description"] for table in tables2_cleaned])
+    boundaries1 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables1_cleaned])
+    boundaries2 = np.array([table.joined_mapping_table["embedding"].index.size for table in tables2_cleaned])
     vectors_concatenated = np.concatenate([vectors_tables1, vectors_tables2])
-    descriptions_concatenated = np.concatenate(
-        [descriptions_table1, descriptions_table2]
-    )
-    boundaries_concatenated = size_array_to_boundaries(
-        np.concatenate([boundaries1, boundaries2])
-    )
+    descriptions_concatenated = np.concatenate([descriptions_table1, descriptions_table2])
+    boundaries_concatenated = size_array_to_boundaries(np.concatenate([boundaries1, boundaries2]))
     return vectors_concatenated, descriptions_concatenated, boundaries_concatenated

From 3456fead3bb177def9cb58b7a9236e4157292842 Mon Sep 17 00:00:00 2001
From: Mehmet Can Ay
Date: Wed, 14 Feb 2024 16:44:27 +0100
Subject: [PATCH 3/3] add: model

---
 index/db/__init__.py |  0
 index/db/model.py    | 56 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 index/db/__init__.py
 create mode 100644 index/db/model.py

diff --git a/index/db/__init__.py b/index/db/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/index/db/model.py b/index/db/model.py
new file mode 100644
index 0000000..c30eca5
--- /dev/null
+++ b/index/db/model.py
@@ -0,0 +1,56 @@
+import pandas as pd
+
+
+class Terminology:
+
+    def __init__(self, identifier: str, name: str):
+        self.identifier = identifier
+        self.name = name
+
+
+class Concept:
+
+    def __init__(self, identifier: str, terminology: Terminology):
+        self.identifier = identifier
+        self.terminology = terminology
+
+
+class Embedding:
+
+    def __init__(self, embedding: [float], source: str):
+        self.embedding = embedding
+        self.source = source
+
+    def to_dataframe(self):
+        return pd.DataFrame(self.embedding, columns=[self.source])
+
+
+class Variable:
+
+    def __init__(
+        self, name: str, description: str, source: str, embedding: Embedding = None
+    ):
+        self.name = name
+        self.description = description
+        self.source = source
+        self.embedding = embedding
+
+
+class Mapping:
+
+    def __init__(self, concept: Concept, variable: Variable, source: str):
+        self.concept = concept
+        self.variable = variable
+        self.source = source
+
+    def __eq__(self, other):
+        return (
+            self.concept.identifier == other.concept.identifier
+            and self.variable.name == other.variable.name
+        )
+
+    def __hash__(self):
+        return hash((self.concept.identifier, self.variable.name))
+
+    def __str__(self):
+        return f"{self.variable.name} ({self.variable.description}) -> {self.concept.identifier}"
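
For context, a short sketch of how the data model re-added under index/db/ is meant to be wired together; every identifier, name, and description below is invented for illustration, and the import assumes the package root is on PYTHONPATH.

# Hypothetical usage of index/db/model.py; all concrete values are made up.
from index.db.model import Terminology, Concept, Variable, Mapping

terminology = Terminology("snomed", "SNOMED CT")
concept = Concept("C0011849", terminology)
variable = Variable("DIABETES", "Diabetes mellitus diagnosis", source="adni")
mapping_a = Mapping(concept, variable, source="adni")
mapping_b = Mapping(concept, variable, source="jadni")

# Equality and hashing consider only the concept identifier and variable name,
# so the same variable -> concept pair deduplicates across mapping sources:
assert mapping_a == mapping_b
assert len({mapping_a, mapping_b}) == 1
print(mapping_a)  # DIABETES (Diabetes mellitus diagnosis) -> C0011849

Keying equality on the (concept identifier, variable name) pair is what lets duplicate mappings harvested from different cohort catalogues collapse into a single entry when collected into a set.
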