
updated evaluation
SubhadityaMukherjee committed Jul 22, 2024
1 parent 5398f9e commit 4fd8f48
Showing 17 changed files with 193 additions and 79 deletions.
25 changes: 16 additions & 9 deletions backend/modules/metadata_utils.py
@@ -73,7 +73,8 @@ def process_metadata(
"""
raise NotImplementedError

def load_metadata(self, file_path: str):
@staticmethod
def load_metadata(file_path: str):
"""
Description: Load metadata from a file.
@@ -86,13 +87,15 @@ def load_metadata(self, file_path: str):
"Metadata files do not exist. Please run the training pipeline first."
)

def extract_attribute(self, attribute: object, attr_name: str) -> str:
@staticmethod
def extract_attribute(attribute: object, attr_name: str) -> str:
"""
Description: Extract an attribute from the OpenML object.
"""
return getattr(attribute, attr_name, "")

def join_attributes(self, attribute: object, attr_name: str) -> str:
@staticmethod
def join_attributes(attribute: object, attr_name: str) -> str:
"""
Description: Join the attributes of the OpenML object.
"""
@@ -104,8 +107,8 @@ def join_attributes(self, attr_name: str) -> str:
else ""
)

@staticmethod
def create_combined_information_df_for_datasets(
self,
data_id: int | Sequence[int],
descriptions: Sequence[str],
joined_qualities: Sequence[str],
@@ -123,7 +126,8 @@ def create_combined_information_df_for_datasets(
}
)

def merge_all_columns_to_string(self, row: pd.Series) -> str:
@staticmethod
def merge_all_columns_to_string(row: pd.Series) -> str:
"""
Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
"""
Expand All @@ -143,8 +147,9 @@ def combine_metadata(
)
return all_dataset_metadata

@staticmethod
def subset_metadata(
self, subset_ids: Sequence[int] | None, all_dataset_metadata: pd.DataFrame
subset_ids: Sequence[int] | None, all_dataset_metadata: pd.DataFrame
):
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
@@ -200,7 +205,7 @@ def process_metadata(

all_dataset_metadata.to_csv(file_path)

if self.config.get("use_chroma_for_saving_metadata") == True:
if self.config.get("use_chroma_for_saving_metadata"):
client = chromadb.PersistentClient(
path=self.config["persist_dir"] + "metadata_db"
)
Expand Down Expand Up @@ -318,12 +323,14 @@ def get_all_metadata_from_openml(self):

return openml_data_object, data_id, all_objects, handler

def load_metadata_from_file(self, filename: str):
@staticmethod
def load_metadata_from_file(filename: str):
# Implement the function to load metadata from a file
with open(filename, "rb") as f:
return pickle.load(f)

def save_metadata_to_file(self, data: Tuple, save_filename: str):
@staticmethod
def save_metadata_to_file(data: Tuple, save_filename: str):
# Implement the function to save metadata to a file
with open(save_filename, "wb") as f:
pickle.dump(data, f)
4 changes: 4 additions & 0 deletions docs/UI/frontend.md
@@ -17,8 +17,12 @@
- Once the results of the RAG pipeline are obtained, the resulting list of IDs is queried from the metadata files (to be replaced with Elasticsearch later) and then the relevant data is displayed.
- The query-parsing LLM can now read the query and infer which columns the user finds relevant (e.g., "find me a dataset with multiple classes" enables the filter `num_classes >= 2`); a rough sketch of applying such a filter follows below.
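
As a loose illustration only — the structured-filter format and the dataframe contents below are assumptions for the sketch, not the actual query-parser output or UI code — applying such an inferred filter to a metadata dataframe might look like this:

```python
# Hypothetical sketch: the real query parser's output format and the UI code differ.
import pandas as pd

metadata = pd.DataFrame(
    {"did": [61, 1590, 1596], "name": ["iris", "adult", "covertype"], "num_classes": [3, 2, 7]}
)

# Suppose the query-parsing LLM returned this structured filter for
# "find me a dataset with multiple classes":
inferred_filter = {"column": "num_classes", "op": ">=", "value": 2}

ops = {
    ">=": lambda series, value: series >= value,
    "<=": lambda series, value: series <= value,
    "==": lambda series, value: series == value,
}
mask = ops[inferred_filter["op"]](metadata[inferred_filter["column"]], inferred_filter["value"])
print(metadata[mask])  # only rows satisfying num_classes >= 2 remain
```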

### paths.json
- Configure this file if any of the endpoints change.

### ui.py
- This is where all the above logic is executed and displayed using Streamlit.

### ui_utils.py
- This is where all the logic is defined.
- Query filtering
19 changes: 19 additions & 0 deletions docs/evaluation/api_reference.md
@@ -0,0 +1,19 @@
## Consistency evaluation

::: consistence_eval

## Streamlit labelling app

::: labellingapp

### Merging labels

::: merge_labels

### Run Batch Training

::: run_all_training

### Evaluation Utils

::: evaluation_utils
1 change: 0 additions & 1 deletion docs/evaluation/evaluate_training.md

This file was deleted.

29 changes: 29 additions & 0 deletions docs/evaluation/evaluation.md
@@ -0,0 +1,29 @@
# Evaluation of LLM models and techniques

## How to run
- Start the language server at the root of this repository with `./start_llm_service.sh`. This step is required; do not skip it.
- Run `python run_all_training.py` to train all models (fetch the data, create a vector store for each model, etc.).
- Run `python evaluate.py` to run all evaluations.
- Results are found in `./evaluation_results.csv` and `evaluation_results.png`.

## How to add a new evaluation

- Adding a new evaluation is fairly straightforward.
- (Note that `training_utils.py` already overrides some classes from the original training code, which means you can modify it to your heart's content without affecting the main code. Enjoy~)
- Step 1: Find the method you want to override and override the class/method in `training_utils.py` (a toy sketch of this pattern follows this list).
- Step 2: Add some if statements in `class ExperimentRunner` to ensure you don't break everything.
- Step 3: Follow the `ExperimentRunner` templates in `run_all_training.py` to add whatever you added in Step 2 as a new experiment.
- Give it a custom name so it is easy to understand what it does.
- Do not worry: experiments are cached and won't run again if they have already been run.
- Step 4: If you changed anything in the config, make sure you reset it afterwards; since the file runs in one go, it would otherwise affect the following experiments.
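
A toy, self-contained sketch of the override-and-dispatch pattern described above — the class names `BaseMetadataProcessor`, `TruncatedMetadataProcessor`, and the `truncate_metadata` config flag are invented for illustration and do not exist in the codebase:

```python
# Toy illustration only; the real override lives in training_utils.py and the
# dispatch logic in class ExperimentRunner (run_all_training.py).

class BaseMetadataProcessor:
    """Stand-in for a class from the original training code."""

    def process(self, text: str) -> str:
        return text.lower()


class TruncatedMetadataProcessor(BaseMetadataProcessor):
    """Step 1: override the method you care about."""

    def process(self, text: str) -> str:
        return super().process(text[:100])  # e.g. only embed the first 100 characters


def get_processor(config: dict) -> BaseMetadataProcessor:
    """Step 2: gate the override behind a config flag so other experiments are unaffected."""
    if config.get("truncate_metadata", False):
        return TruncatedMetadataProcessor()
    return BaseMetadataProcessor()


if __name__ == "__main__":
    # Step 3 would register an experiment (with a descriptive name) that sets this flag.
    print(get_processor({"truncate_metadata": True}).process("A VERY long dataset description ..."))
```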

## How to add a new metric

- In `evaluation_utils.py`, go to `class EvaluationProcessor` and add a new function that calculates your metric (you can use the existing metric functions as templates).
- Register the metric in `self.metric_methods` (a hypothetical example is sketched after the snippet below).
- When running the evaluation, add it to your metrics list:
```python
metrics = ["precision", "recall", "map"]
eval_path = Path("../data/evaluation/")
processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)
```
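
As a hedged illustration of the steps above, a new metric hook might look like the following; the function name `add_f1` and the exact column handling are assumptions based on the existing precision/recall helpers, not code from the repository:

```python
# Hypothetical new metric; in practice it would be added as a @staticmethod
# on EvaluationProcessor and registered in self.metric_methods.
import pandas as pd


def add_f1(grouped_df: pd.DataFrame) -> pd.DataFrame:
    """Compute F1 per group, assuming y_true / y_pred hold comma-joined ID strings."""

    def f1(row) -> float:
        y_true = set(row["y_true"].split(","))
        y_pred = set(row["y_pred"].split(","))
        true_positives = len(y_true & y_pred)
        precision = true_positives / len(y_pred) if y_pred else 0.0
        recall = true_positives / len(y_true) if y_true else 0.0
        denom = precision + recall
        return 2 * precision * recall / denom if denom else 0.0

    grouped_df["f1"] = grouped_df.apply(f1, axis=1)
    return grouped_df


# Registration inside EvaluationProcessor.__init__ would then look like:
#   self.metric_methods["f1"] = self.add_f1
# and the evaluation would request metrics=["precision", "recall", "map", "f1"].
```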
18 changes: 18 additions & 0 deletions docs/evaluation/index.md
@@ -0,0 +1,18 @@
# Evaluating the AI search
- The challenge with evaluation in this case was the lack of labels. To solve that, we created a simple Streamlit app that lets us label datasets according to a few tags.
- The evaluation pipeline runs the entire RAG + query-LLM pipeline on the subset of labelled data. The RAG does not have access to the entire OpenML database, only the labelled subset.

## Manual labelling
### Streamlit labelling app
- Refer to [labelling app](./labelling_tool.md) for more information.

### Merging labels
- Since multiple people labelled the datasets, it was useful to have a script that merges their labels into a single dataframe.
- The labels were generated per person using the labelling app and then merged into a single consistent dataframe using this script.
- Refer to [merging labels](./merging_labels.md) for more information.

### Consistency evaluation
- Since multiple people labelled the same datasets differently, Cohen's kappa was used to evaluate the consistency of the labelling. A value of ~0.45 was obtained, which indicates moderate agreement (a minimal sketch of this computation is shown below).
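
A minimal sketch of how such a score can be computed with scikit-learn; the label values below are invented, and the repository's actual computation lives in the `consistence_eval` module:

```python
# Minimal sketch, not the repository's consistence_eval code: computes
# Cohen's kappa between two annotators' tags for the same datasets.
from sklearn.metrics import cohen_kappa_score

# Invented example labels: one tag per dataset, from two annotators.
annotator_a = ["vision", "tabular", "nlp", "tabular", "vision"]
annotator_b = ["vision", "nlp", "nlp", "tabular", "tabular"]

kappa = cohen_kappa_score(annotator_a, annotator_b)
print(f"Cohen's kappa: {kappa:.2f}")  # ~0.41 here; 0.4-0.6 is usually read as moderate agreement
```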

## Running the evaluation
- Refer to [running the evaluation](./evaluation.md) for more information.
1 change: 0 additions & 1 deletion docs/evaluation/labelling_tool.md
@@ -34,4 +34,3 @@ You can now browse through datasets, and for each dataset you can select which o
Changes are not automatically persisted. If the 'save me' button is red, there are local unsaved changes. Click it to persist the changes.

We should be able to merge the different label files later without problem.

7 changes: 7 additions & 0 deletions docs/evaluation/merging_labels.md
@@ -0,0 +1,7 @@
# Merging labels
- Takes multiple JSON label files as input and merges them into a single CSV file with the columns `Topics,Dataset IDs`.

## How to use
- Place all the `label.json` files in the folder `/tools/data/all_labels`.
- Run `python merge_labels.py` from the `tools` directory.
- The results are written to `/data/evaluation/merged_labels.csv` (a rough sketch of the merge logic is shown below).
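
A rough, hypothetical sketch of the merge step — `merge_labels.py` is the authoritative implementation, and the JSON layout assumed here (a mapping from topic to a list of dataset IDs) may differ from the real files:

```python
# Hypothetical sketch only; merge_labels.py is the authoritative implementation.
import glob
import json
from collections import defaultdict

import pandas as pd

merged = defaultdict(set)  # topic -> set of dataset IDs
for path in glob.glob("tools/data/all_labels/*.json"):
    with open(path) as f:
        labels = json.load(f)  # assumed layout: {"topic": [dataset_id, ...], ...}
    for topic, dataset_ids in labels.items():
        merged[topic].update(int(i) for i in dataset_ids)

rows = [
    {"Topics": topic, "Dataset IDs": ",".join(str(i) for i in sorted(ids))}
    for topic, ids in merged.items()
]
pd.DataFrame(rows).to_csv("data/evaluation/merged_labels.csv", index=False)
```
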
8 changes: 0 additions & 8 deletions docs/evaluation/readme.md

This file was deleted.

1 change: 0 additions & 1 deletion docs/evaluation/run_batch_training.md

This file was deleted.

4 changes: 2 additions & 2 deletions evaluation/evaluate.py
@@ -4,9 +4,9 @@
import pandas as pd
from evaluation_utils import EvaluationProcessor
from tqdm import tqdm

metrics = ["precision", "recall", "map"]
eval_path = Path("../data/evaluation/")
processor = EvaluationProcessor(eval_path, sort_by=None)
processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)
results_display = processor.run()
print(results_display)

63 changes: 38 additions & 25 deletions evaluation/evaluation_utils.py
@@ -6,19 +6,31 @@


class EvaluationProcessor:
"""
Description: Process all the evaluated results, add the required metrics and save results as a csv/generate plots
"""
def __init__(
self, eval_path, metrics=["precision", "recall", "map"], sort_by="precision"
self, eval_path, metrics=None, sort_by="precision"
):
if metrics is None:
metrics = ["precision", "recall", "map"]
self.eval_path = eval_path
self.load_eval_queries = self.load_queries_from_csv()
self.query_templates = self.load_query_templates()
self.query_key_dict = self.create_query_key_dict()
self.metrics = metrics
self.sort_by = sort_by

# Define a dictionary to map metric names to their corresponding methods
self.metric_methods = {
"precision": self.add_precision,
"recall": self.add_recall,
"map": self.add_map
}

def run(self):
"""
Description: Run the evaluation process and display the results
Description: Load files, Run the evaluation process and display the results
"""
csv_files = self.load_result_files()
@@ -28,7 +40,7 @@ def run(self):

def load_result_files(self):
"""
Description: Find all the csv files in the evaluation directory
Description: Find all the csv files in the evaluation directory.
"""
return glob.glob(str(self.eval_path / "*/*/results.csv"))
@@ -42,9 +54,9 @@ def generate_results(self, csv_files):
for exp_path in tqdm(csv_files):
exp = pd.read_csv(exp_path).rename(columns={"did": "y_pred"})
exp["exp_folder_name"] = Path(exp_path).parent.name
exp["custom_experiement"] = ""
exp["custom_experiment"] = ""
# split exp_folder_name by @ to get extra information
exp["custom_experiement"] = exp["exp_folder_name"].apply(
exp["custom_experiment"] = exp["exp_folder_name"].apply(
lambda x: x.split("@")[0] if "@" in x else ""
)
exp.drop("exp_folder_name", axis=1, inplace=True)
@@ -56,24 +68,11 @@ def generate_results(self, csv_files):
"llm_model",
"query",
"llm_before_rag",
"custom_experiement",
"custom_experiment",
]
).agg({"y_true": ",".join, "y_pred": ",".join})

# add metrics
for metric in self.metrics:
if metric == "precision":
grouped_results_for_y_true_and_pred = self.add_precision(
grouped_results_for_y_true_and_pred
)
elif metric == "recall":
grouped_results_for_y_true_and_pred = self.add_recall(
grouped_results_for_y_true_and_pred
)
elif metric == "map":
grouped_results_for_y_true_and_pred = self.add_map(
grouped_results_for_y_true_and_pred
)
grouped_results_for_y_true_and_pred = self.add_metrics(grouped_results_for_y_true_and_pred)

# aggregate by computing the average of the metrics for each group
grouped_results_for_y_true_and_pred = (
@@ -82,7 +81,7 @@
"embedding_model",
"llm_model",
"llm_before_rag",
"custom_experiement",
"custom_experiment",
]
).agg({metric: "mean" for metric in self.metrics})
)
@@ -95,6 +94,16 @@ def generate_results(self, csv_files):
merged_df = merged_df.sort_values(by=self.sort_by, ascending=False)
return merged_df

def add_metrics(self, grouped_results_for_y_true_and_pred):
# Iterate over the metrics and apply the corresponding method if it exists
for metric in self.metrics:
if metric in self.metric_methods:
grouped_results_for_y_true_and_pred = self.metric_methods[metric](
grouped_results_for_y_true_and_pred
)

return grouped_results_for_y_true_and_pred

def load_queries_from_csv(self):
"""
Description: Load the queries from the csv file
@@ -137,7 +146,8 @@ def preprocess_results(self, results_df):
results_df["y_true"] = results_df["query"].map(self.query_key_dict)
return results_df

def add_precision(self, grouped_df):
@staticmethod
def add_precision(grouped_df):
"""
Description: Compute the precision metric for each group in the dataframe
"""
@@ -147,7 +157,8 @@ def add_precision(self, grouped_df):
]
return grouped_df

def add_recall(self, grouped_df):
@staticmethod
def add_recall(grouped_df):
"""
Description: Compute the recall metric for each group in the dataframe
@@ -158,7 +169,8 @@ def add_recall(self, grouped_df):
]
return grouped_df

def add_map(self, grouped_df):
@staticmethod
def add_map(grouped_df):
"""
Description: Compute the mean average precision metric for each group in the dataframe
"""
@@ -174,7 +186,8 @@ def add_map(self, grouped_df):
]
return grouped_df

def display_results(self, results_df):
@staticmethod
def display_results(results_df):
# add more preprocessing here
results_df = pd.DataFrame(results_df)
# heatmap results