From 4fd8f48ee489f0be3110435a36c70d19e6bed237 Mon Sep 17 00:00:00 2001 From: SubhadityaMukherjee Date: Mon, 22 Jul 2024 17:39:13 +0200 Subject: [PATCH] updated evaluation --- backend/modules/metadata_utils.py | 25 +++++++---- docs/UI/frontend.md | 4 ++ docs/evaluation/api_reference.md | 19 ++++++++ docs/evaluation/evaluate_training.md | 1 - docs/evaluation/evaluation.md | 29 ++++++++++++ docs/evaluation/index.md | 18 ++++++++ docs/evaluation/labelling_tool.md | 1 - docs/evaluation/merging_labels.md | 7 +++ docs/evaluation/readme.md | 8 ---- docs/evaluation/run_batch_training.md | 1 - evaluation/evaluate.py | 4 +- evaluation/evaluation_utils.py | 63 ++++++++++++++++----------- evaluation/run_all_training.py | 9 +++- evaluation/training_utils.py | 24 +++++++--- tools/{app.py => labellingapp.py} | 2 +- tools/merge_labels.py | 52 ++++++++++++---------- tools/requirements.txt | 5 ++- 17 files changed, 193 insertions(+), 79 deletions(-) create mode 100644 docs/evaluation/api_reference.md delete mode 100644 docs/evaluation/evaluate_training.md create mode 100644 docs/evaluation/evaluation.md create mode 100644 docs/evaluation/index.md create mode 100644 docs/evaluation/merging_labels.md delete mode 100644 docs/evaluation/readme.md delete mode 100644 docs/evaluation/run_batch_training.md rename tools/{app.py => labellingapp.py} (99%) diff --git a/backend/modules/metadata_utils.py b/backend/modules/metadata_utils.py index 2a04bd6..f052aec 100644 --- a/backend/modules/metadata_utils.py +++ b/backend/modules/metadata_utils.py @@ -73,7 +73,8 @@ def process_metadata( """ raise NotImplementedError - def load_metadata(self, file_path: str): + @staticmethod + def load_metadata(file_path: str): """ Description: Load metadata from a file. @@ -86,13 +87,15 @@ def load_metadata(self, file_path: str): "Metadata files do not exist. Please run the training pipeline first." ) - def extract_attribute(self, attribute: object, attr_name: str) -> str: + @staticmethod + def extract_attribute(attribute: object, attr_name: str) -> str: """ Description: Extract an attribute from the OpenML object. """ return getattr(attribute, attr_name, "") - def join_attributes(self, attribute: object, attr_name: str) -> str: + @staticmethod + def join_attributes(attribute: object, attr_name: str) -> str: """ Description: Join the attributes of the OpenML object. """ @@ -104,8 +107,8 @@ def join_attributes(self, attribute: object, attr_name: str) -> str: else "" ) + @staticmethod def create_combined_information_df_for_datasets( - self, data_id: int | Sequence[int], descriptions: Sequence[str], joined_qualities: Sequence[str], @@ -123,7 +126,8 @@ def create_combined_information_df_for_datasets( } ) - def merge_all_columns_to_string(self, row: pd.Series) -> str: + @staticmethod + def merge_all_columns_to_string(row: pd.Series) -> str: """ Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... 
description" """ @@ -143,8 +147,9 @@ def combine_metadata( ) return all_dataset_metadata + @staticmethod def subset_metadata( - self, subset_ids: Sequence[int] | None, all_dataset_metadata: pd.DataFrame + subset_ids: Sequence[int] | None, all_dataset_metadata: pd.DataFrame ): if subset_ids is not None: subset_ids = [int(x) for x in subset_ids] @@ -200,7 +205,7 @@ def process_metadata( all_dataset_metadata.to_csv(file_path) - if self.config.get("use_chroma_for_saving_metadata") == True: + if self.config.get("use_chroma_for_saving_metadata"): client = chromadb.PersistentClient( path=self.config["persist_dir"] + "metadata_db" ) @@ -318,12 +323,14 @@ def get_all_metadata_from_openml(self): return openml_data_object, data_id, all_objects, handler - def load_metadata_from_file(self, filename: str): + @staticmethod + def load_metadata_from_file(filename: str): # Implement the function to load metadata from a file with open(filename, "rb") as f: return pickle.load(f) - def save_metadata_to_file(self, data: Tuple, save_filename: str): + @staticmethod + def save_metadata_to_file(data: Tuple, save_filename: str): # Implement the function to save metadata to a file with open(save_filename, "wb") as f: pickle.dump(data, f) diff --git a/docs/UI/frontend.md b/docs/UI/frontend.md index 8e1e459..33436b1 100644 --- a/docs/UI/frontend.md +++ b/docs/UI/frontend.md @@ -17,8 +17,12 @@ - Once the results of the RAG pipeline are obtained, the resulting list of IDs is queried from the metadata files (to be replaced with elasticsearch later) and then the relevant data is displayed. - Now it is possible for the query parsing LLM to read the query and infer the columns that the user finds relevant. (eg: "find me a dataset with multiple classes" would enable the filters where `num_classes >=2`). +### paths.json +- Configure this file if any of the endpoints change. + ### ui.py - This is where all the above logic is executed and displayed using Streamlit. + ### ui_utils.py - This is where all the logic is defined. - Query filtering diff --git a/docs/evaluation/api_reference.md b/docs/evaluation/api_reference.md new file mode 100644 index 0000000..3f298ee --- /dev/null +++ b/docs/evaluation/api_reference.md @@ -0,0 +1,19 @@ +## Consistency evaluation + +::: consistence_eval + +## Streamlit labelling app + +::: labellingapp + +### Merging labels + +::: merge_labels + +### Run Batch Training + +::: run_all_training + +### Evaluation Utils + +::: evaluation_utils diff --git a/docs/evaluation/evaluate_training.md b/docs/evaluation/evaluate_training.md deleted file mode 100644 index fdd1fc5..0000000 --- a/docs/evaluation/evaluate_training.md +++ /dev/null @@ -1 +0,0 @@ -:::evaluation_utils \ No newline at end of file diff --git a/docs/evaluation/evaluation.md b/docs/evaluation/evaluation.md new file mode 100644 index 0000000..f3c0a0b --- /dev/null +++ b/docs/evaluation/evaluation.md @@ -0,0 +1,29 @@ +# Evaluation of LLM models and techniques + +## How to run +- Start the language server at the root of this repository with `./start_llm_service.sh` . This is important, do not skip it. +- Run `python run_all_training.py` to train all models (get data, create vector store for each etc) +- Run `python evaluate.py` to run all evaluations +- Results are found in in `./evaluation_results.csv` and `evaluation_results.png` + +## How to add a new evaluation + +- It is "pretty easy" to add a new evaluation. + - (Note that `training_utils.py` already overloads some classes from the original training. 
+  - Step 1: Find the method you want to override and overload the class/method in `training_utils.py`.
+  - Step 2: Add some if statements in `class ExperimentRunner` to ensure you don't break everything.
+  - Step 3: Follow the ExperimentRunner templates in `run_all_training.py` to add whatever you added in Step 2 as a new experiment.
+    - Give it a custom name so it is easy to understand what it does.
+    - Do not worry, the experiments are cached and won't run again if you have run them before.
+  - Step 4: If you changed something in the config, make sure you reset it. Since the file runs in one go, it will otherwise affect the following experiments.
+
+## How to add a new metric
+
+- In `evaluation_utils.py`, go to `class EvaluationProcessor` and add a new function that calculates your metric. (You can use the templates provided.)
+- Register the new metric in `self.metric_methods`.
+- When running the evaluation, add it to your metrics list:
+```python
+metrics = ["precision", "recall", "map"]
+eval_path = Path("../data/evaluation/")
+processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)
+```
\ No newline at end of file
diff --git a/docs/evaluation/index.md b/docs/evaluation/index.md
new file mode 100644
index 0000000..6cf0e12
--- /dev/null
+++ b/docs/evaluation/index.md
@@ -0,0 +1,18 @@
+# Evaluating the AI search
+- The challenge with evaluation in this case was the lack of labels. To solve that, we created a simple Streamlit app that let us label datasets according to a few tags.
+- The evaluation pipeline runs the entire RAG + Query LLM pipeline on the subset of labelled data. The RAG does not have access to the entire OpenML database, only the subset that was labelled.
+
+## Manual labelling
+### Streamlit labelling app
+- Refer to [labelling app](./labelling_tool.md) for more information.
+
+### Merging labels
+- Since multiple people labelled the datasets, it was useful to have a script that merges their labels into a single dataframe.
+- The labels were generated per person using the labelling app and then merged into a single consistent dataframe using this script.
+- Refer to [merging labels](./merging_labels.md) for more information.
+
+### Consistency evaluation
+- Since multiple people labelled the same dataset differently, Cohen's Kappa score was used to evaluate the consistency of the labelling. A value of ~0.45 was obtained, which indicates moderate agreement.
+
+## Running the evaluation
+- Refer to [running the evaluation](./evaluation.md) for more information.
\ No newline at end of file
diff --git a/docs/evaluation/labelling_tool.md b/docs/evaluation/labelling_tool.md
index cf27947..8567441 100644
--- a/docs/evaluation/labelling_tool.md
+++ b/docs/evaluation/labelling_tool.md
@@ -34,4 +34,3 @@
 
 You can now browse through datasets, and for each dataset you can select which of the topics apply.
 
 Changes are not automatically persisted. If the 'save me' button is red, there are local unsaved changes. Click it to persist the changes.
 We should be able to merge the different label files later without problem.
-
diff --git a/docs/evaluation/merging_labels.md b/docs/evaluation/merging_labels.md
new file mode 100644
index 0000000..41f856e
--- /dev/null
+++ b/docs/evaluation/merging_labels.md
@@ -0,0 +1,7 @@
+# Merging labels
+- Takes multiple JSON label files as input and merges them into a single CSV file with the columns `Topics,Dataset IDs`.
+
+## How to use
+- Place all the label JSON files in the folder `/tools/data/all_labels`.
+- Run `python merge_labels.py` from the `tools` directory.
+- The results will be written to `/data/evaluation/merged_labels.csv`.
\ No newline at end of file
diff --git a/docs/evaluation/readme.md b/docs/evaluation/readme.md
deleted file mode 100644
index cdc7372..0000000
--- a/docs/evaluation/readme.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Developer Tutorials
-
-- Hello there, future OpenML contributor! It is nice meeting you here. This page is a collection of tutorials that will help you get started with contributing to the OpenML RAG pipeline.
-- The tutorials show you how to perform common tasks and should make it a lot easier to get started with contributing to this project.
-- Note that you would have had to setup the project before you begin. If you missed this step, please refer to [](../../readme.md)
-
-## How to use them
-- Once you have setup the project, just navigate to the tutorial you are interested in and open them in your IDE.
\ No newline at end of file
diff --git a/docs/evaluation/run_batch_training.md b/docs/evaluation/run_batch_training.md
deleted file mode 100644
index c1760c1..0000000
--- a/docs/evaluation/run_batch_training.md
+++ /dev/null
@@ -1 +0,0 @@
-:::run_all_training
\ No newline at end of file
diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index 5a46e50..ed4c363 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -4,9 +4,9 @@
 import pandas as pd
 from evaluation_utils import EvaluationProcessor
 from tqdm import tqdm
-
+metrics = ["precision", "recall", "map"]
 eval_path = Path("../data/evaluation/")
-processor = EvaluationProcessor(eval_path, sort_by=None)
+processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)
 results_display = processor.run()
 print(results_display)
diff --git a/evaluation/evaluation_utils.py b/evaluation/evaluation_utils.py
index dc16c18..44d67a9 100644
--- a/evaluation/evaluation_utils.py
+++ b/evaluation/evaluation_utils.py
@@ -6,9 +6,14 @@
 
 
 class EvaluationProcessor:
+    """
+    Description: Process all the evaluated results, add the required metrics and save results as a csv/generate plots
+    """
     def __init__(
-        self, eval_path, metrics=["precision", "recall", "map"], sort_by="precision"
+        self, eval_path, metrics=None, sort_by="precision"
     ):
+        if metrics is None:
+            metrics = ["precision", "recall", "map"]
         self.eval_path = eval_path
         self.load_eval_queries = self.load_queries_from_csv()
         self.query_templates = self.load_query_templates()
@@ -16,9 +21,16 @@ def __init__(
         self.metrics = metrics
         self.sort_by = sort_by
 
+        # Define a dictionary to map metric names to their corresponding methods
+        self.metric_methods = {
+            "precision": self.add_precision,
+            "recall": self.add_recall,
+            "map": self.add_map
+        }
+
     def run(self):
         """
-        Description: Run the evaluation process and display the results
+        Description: Load files, run the evaluation process and display the results
 
         """
         csv_files = self.load_result_files()
@@ -28,7 +40,7 @@ def run(self):
 
     def load_result_files(self):
         """
-        Description: Find all the csv files in the evaluation 
directory. """ return glob.glob(str(self.eval_path / "*/*/results.csv")) @@ -42,9 +54,9 @@ def generate_results(self, csv_files): for exp_path in tqdm(csv_files): exp = pd.read_csv(exp_path).rename(columns={"did": "y_pred"}) exp["exp_folder_name"] = Path(exp_path).parent.name - exp["custom_experiement"] = "" + exp["custom_experiment"] = "" # split exp_folder_name by @ to get extra information - exp["custom_experiement"] = exp["exp_folder_name"].apply( + exp["custom_experiment"] = exp["exp_folder_name"].apply( lambda x: x.split("@")[0] if "@" in x else "" ) exp.drop("exp_folder_name", axis=1, inplace=True) @@ -56,24 +68,11 @@ def generate_results(self, csv_files): "llm_model", "query", "llm_before_rag", - "custom_experiement", + "custom_experiment", ] ).agg({"y_true": ",".join, "y_pred": ",".join}) - # add metrics - for metric in self.metrics: - if metric == "precision": - grouped_results_for_y_true_and_pred = self.add_precision( - grouped_results_for_y_true_and_pred - ) - elif metric == "recall": - grouped_results_for_y_true_and_pred = self.add_recall( - grouped_results_for_y_true_and_pred - ) - elif metric == "map": - grouped_results_for_y_true_and_pred = self.add_map( - grouped_results_for_y_true_and_pred - ) + grouped_results_for_y_true_and_pred = self.add_metrics(grouped_results_for_y_true_and_pred) # aggregate by computing the average of the metrics for each group grouped_results_for_y_true_and_pred = ( @@ -82,7 +81,7 @@ def generate_results(self, csv_files): "embedding_model", "llm_model", "llm_before_rag", - "custom_experiement", + "custom_experiment", ] ).agg({metric: "mean" for metric in self.metrics}) ) @@ -95,6 +94,16 @@ def generate_results(self, csv_files): merged_df = merged_df.sort_values(by=self.sort_by, ascending=False) return merged_df + def add_metrics(self, grouped_results_for_y_true_and_pred): + # Iterate over the metrics and apply the corresponding method if it exists + for metric in self.metrics: + if metric in self.metric_methods: + grouped_results_for_y_true_and_pred = self.metric_methods[metric]( + grouped_results_for_y_true_and_pred + ) + + return grouped_results_for_y_true_and_pred + def load_queries_from_csv(self): """ Description: Load the queries from the csv file @@ -137,7 +146,8 @@ def preprocess_results(self, results_df): results_df["y_true"] = results_df["query"].map(self.query_key_dict) return results_df - def add_precision(self, grouped_df): + @staticmethod + def add_precision(grouped_df): """ Description: Compute the precision metric for each group in the dataframe """ @@ -147,7 +157,8 @@ def add_precision(self, grouped_df): ] return grouped_df - def add_recall(self, grouped_df): + @staticmethod + def add_recall(grouped_df): """ Description: Compute the recall metric for each group in the dataframe @@ -158,7 +169,8 @@ def add_recall(self, grouped_df): ] return grouped_df - def add_map(self, grouped_df): + @staticmethod + def add_map(grouped_df): """ Description: Compute the mean average precision metric for each group in the dataframe """ @@ -174,7 +186,8 @@ def add_map(self, grouped_df): ] return grouped_df - def display_results(self, results_df): + @staticmethod + def display_results(results_df): # add more preprocessing here results_df = pd.DataFrame(results_df) # heatmap results diff --git a/evaluation/run_all_training.py b/evaluation/run_all_training.py index d32ccd1..52890c4 100644 --- a/evaluation/run_all_training.py +++ b/evaluation/run_all_training.py @@ -43,7 +43,7 @@ # %% [markdown] # ## Setup evaluation data -# ### If you used tools/app.py 
to generate evaluation data +# ### If you used tools/labellingapp.py to generate evaluation data # - You can ignore this and use the data generated by the tool # ### If you did not # - You can use evaluation data of the format {"id": ["tag1", "tag2"] } and save it as a json file @@ -65,7 +65,7 @@ ["Topics", "Dataset IDs"] ] # %% -subset_ids = [row.split(",") for row in list(load_eval_queries["Dataset IDs"].values)] +subset_ids = [row.split(",") for row in load_eval_queries["Dataset IDs"].to_list()] # flatten the list and get unique values subset_ids = list(set([int(item) for sublist in subset_ids for item in sublist])) # %% @@ -81,6 +81,7 @@ json.dump(query_key_dict, open(eval_path / "query_key_dict.json", "w")) """ +EXPERIMENT 1 Main evaluation loop that is used to run the base experiments using different models and embeddings. Takes into account the following: original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing @@ -105,6 +106,8 @@ expRunner.run_experiments() """ +EXPERIMENT 2 + Evaluating temperature = 1 (default was 0.95) Takes into account the following: original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing @@ -137,6 +140,8 @@ config["temperature"] = 0.95 """ +EXPERIMENT 3 + Evaluating search type [mmr, similarity_score_threshold] (default was similarity) Takes into account the following: original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing diff --git a/evaluation/training_utils.py b/evaluation/training_utils.py index e6e6bd2..dbc6ae4 100644 --- a/evaluation/training_utils.py +++ b/evaluation/training_utils.py @@ -62,6 +62,9 @@ def process_llm_model_name_ollama(name: str) -> str: def ollama_setup(list_of_llm_models: list): + """ + Description: Setup Ollama server and pull the llm_model that is being used + """ os.system("ollama serve&") print("Waiting for Ollama server to be active...") while os.system("ollama list | grep 'NAME'") == "": @@ -85,7 +88,7 @@ def parse_and_update_response(self, metadata): Description: Parse the response from the RAG and LLM services and update the metadata based on the response """ if self.rag_response is not None and self.llm_response is not None: - if self.apply_llm_before_rag == False: + if not self.apply_llm_before_rag: filtered_metadata = metadata[ metadata["did"].isin(self.rag_response["initial_response"]) ] @@ -96,7 +99,7 @@ def parse_and_update_response(self, metadata): llm_parser.get_attributes_from_response() return llm_parser.update_subset_cols(filtered_metadata) - elif self.apply_llm_before_rag == True: + elif self.apply_llm_before_rag: llm_parser = LLMResponseParser(self.llm_response) llm_parser.subset_cols = ["did", "name"] llm_parser.get_attributes_from_response() @@ -106,14 +109,22 @@ def parse_and_update_response(self, metadata): filtered_metadata["did"].isin(self.rag_response["initial_response"]) ] - elif self.apply_llm_before_rag == None: + elif self.apply_llm_before_rag is None: # if no llm response is required, return the initial response return metadata + elif ( + self.rag_response is not None and self.structured_query_response is not None + ): + return metadata[["did", "name"]] else: return metadata class ExperimentRunner: + """ + Description: This class is used to run all the experiments. 
If you want to modify any behavior, change the functions in this class according to what you want. + You may also want to check out ResponseParser. + """ def __init__( self, config, @@ -121,11 +132,13 @@ def __init__( queries, list_of_embedding_models, list_of_llm_models, - types_of_llm_apply=[True, False, None], + types_of_llm_apply=None, subset_ids=None, use_cached_experiment=False, custom_name=None, ): + if types_of_llm_apply is None: + types_of_llm_apply = [True, False, None] self.config = config self.eval_path = eval_path self.queries = queries @@ -157,7 +170,6 @@ def run_experiments(self): self.config, orient="index" ).reset_index() config_df.columns = ["Hyperparameter", "Value"] - config_df.to_csv(main_experiment_directory / "config.csv", index=False) # load the persistent database using ChromaDB client = chromadb.PersistentClient(path=self.config["persist_dir"]) @@ -202,6 +214,7 @@ def run_experiments(self): else: experiment_path = main_experiment_directory / experiment_name os.makedirs(experiment_path, exist_ok=True) + config_df.to_csv(experiment_path / "config.csv", index=False) if self.use_cached_experiment and os.path.exists( experiment_path / "results.csv" @@ -282,7 +295,6 @@ def run_query( result_data_frame["llm_model"] = self.config["llm_model"] result_data_frame["embedding_model"] = self.config["embedding_model"] result_data_frame["llm_before_rag"] = apply_llm_before_rag - # combined_results.append(result_data_frame) combined_results = pd.concat( [combined_results, result_data_frame], ignore_index=True ) diff --git a/tools/app.py b/tools/labellingapp.py similarity index 99% rename from tools/app.py rename to tools/labellingapp.py index dd829be..219b554 100644 --- a/tools/app.py +++ b/tools/labellingapp.py @@ -2,7 +2,7 @@ Small tool for labeling data. pip install streamlit -streamlit run app.py +streamlit run labellingapp.py Expects the metadata csv and the topic csv in the `data` directory. """ diff --git a/tools/merge_labels.py b/tools/merge_labels.py index 22dd3f3..54cdfef 100644 --- a/tools/merge_labels.py +++ b/tools/merge_labels.py @@ -3,6 +3,7 @@ import os from collections import defaultdict from pathlib import Path +from typing import Any import pandas as pd @@ -15,28 +16,35 @@ # Get all files in the labels directory all_files = list(labels_path.glob("*")) -# Read all files and merge them into a single dictionary -merged_labels = defaultdict(set) -for file in all_files: - with open(file, "r") as f: - try: - data = json.load(f) - for key, values in data.items(): - merged_labels[key].update(values) - except json.JSONDecodeError: - print(f"Error reading {file}") - -# Remove empty lists -merged_labels = {k: list(v) for k, v in merged_labels.items() if v} - -# Reverse the dictionary so we have topic -> [dataset_ids] -reversed_labels = defaultdict(set) -for key, values in merged_labels.items(): - for value in values: - reversed_labels[value].add(key) - -# Convert sets to lists for each value -reversed_labels = {k: list(v) for k, v in reversed_labels.items()} + +def merge_labels() -> dict[Any, list]: + """ + Description : Merge labels from multiple JSON label files into a single dictionary. 
+ """ + # Read all files and merge them into a single dictionary + merged_labels = defaultdict(set) + for file in all_files: + with open(file, "r") as f: + try: + data = json.load(f) + for key, values in data.items(): + merged_labels[key].update(values) + except json.JSONDecodeError: + print(f"Error reading {file}") + # Remove empty lists + merged_labels = {k: list(v) for k, v in merged_labels.items() if v} + + # Reverse the dictionary so we have topic -> [dataset_ids] + reversed_labels = defaultdict(set) + for key, values in merged_labels.items(): + for value in values: + reversed_labels[value].add(key) + + # Convert sets to lists for each value + return {k: list(v) for k, v in reversed_labels.items()} + + +reversed_labels = merge_labels() # Write to CSV with open(eval_path / "merged_labels.csv", "w") as f: diff --git a/tools/requirements.txt b/tools/requirements.txt index 94faa87..d002e49 100644 --- a/tools/requirements.txt +++ b/tools/requirements.txt @@ -1 +1,4 @@ -streamlit==1.36.0 \ No newline at end of file +numpy==2.0.1 +pandas==2.2.2 +scikit_learn==1.5.1 +streamlit==1.36.0
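
The consistency check described in `docs/evaluation/index.md` is performed by the `consistence_eval` module referenced in `docs/evaluation/api_reference.md`, which is not part of this diff. The sketch below only illustrates the general shape of such a check with `sklearn.metrics.cohen_kappa_score` (scikit-learn is pinned above in `tools/requirements.txt`); the file names and the `{"<dataset_id>": ["topic", ...]}` label format are assumptions based on the labelling app and `tools/merge_labels.py`.

```python
# Hedged sketch of a labelling-consistency check; not the actual consistence_eval code.
# Assumes two labellers' files in the format written by the labelling app:
# {"<dataset_id>": ["topic1", "topic2", ...]}
import json
from pathlib import Path

from sklearn.metrics import cohen_kappa_score

labels_a = json.loads(Path("data/all_labels/labeller_a.json").read_text())
labels_b = json.loads(Path("data/all_labels/labeller_b.json").read_text())

# Only datasets labelled by both people can be compared.
shared_ids = sorted(set(labels_a) & set(labels_b))
topics = sorted({t for labels in (labels_a, labels_b) for tags in labels.values() for t in tags})

# One binary decision per (dataset, topic) pair for each labeller.
y_a = [int(topic in labels_a[did]) for did in shared_ids for topic in topics]
y_b = [int(topic in labels_b[did]) for did in shared_ids for topic in topics]

print("Cohen's kappa:", cohen_kappa_score(y_a, y_b))
```

On the commonly used Landis and Koch scale, kappa values between 0.41 and 0.60 are read as moderate agreement, which is the range the ~0.45 figure in `docs/evaluation/index.md` falls into.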
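To make the "How to add a new metric" steps in `docs/evaluation/evaluation.md` concrete, here is a minimal sketch that follows the `add_precision`/`add_recall` templates and the new `self.metric_methods` registry in `evaluation_utils.py`. The `CustomEvaluationProcessor` subclass and the `hit_rate` metric are illustrative assumptions, not part of the codebase.

```python
# Illustrative sketch only: registering an extra metric next to the defaults.
from pathlib import Path

from evaluation_utils import EvaluationProcessor


class CustomEvaluationProcessor(EvaluationProcessor):
    def __init__(self, eval_path, metrics=None, sort_by=None):
        super().__init__(eval_path, metrics=metrics, sort_by=sort_by)
        # Step from evaluation.md: register the new metric in self.metric_methods.
        self.metric_methods["hit_rate"] = self.add_hit_rate

    @staticmethod
    def add_hit_rate(grouped_df):
        # 1.0 if at least one relevant dataset id was retrieved for the query, else 0.0.
        # y_true / y_pred are comma-joined id strings, as produced in generate_results.
        grouped_df["hit_rate"] = [
            float(bool(set(y_true.split(",")) & set(y_pred.split(","))))
            for y_true, y_pred in zip(grouped_df["y_true"], grouped_df["y_pred"])
        ]
        return grouped_df


metrics = ["precision", "recall", "map", "hit_rate"]
processor = CustomEvaluationProcessor(Path("../data/evaluation/"), metrics=metrics)
print(processor.run())
```

Because `add_metrics` only applies the methods registered in `self.metric_methods`, adding the metric name to the `metrics` list and registering the method should be all the wiring that is needed.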