
updated evaluation
SubhadityaMukherjee committed Jul 22, 2024
1 parent 5398f9e commit 4fd8f48
Showing 17 changed files with 193 additions and 79 deletions.
25 changes: 16 additions & 9 deletions backend/modules/metadata_utils.py
@@ -73,7 +73,8 @@ def process_metadata(
"""
raise NotImplementedError

def load_metadata(self, file_path: str):
@staticmethod
def load_metadata(file_path: str):
"""
Description: Load metadata from a file.
@@ -86,13 +87,15 @@ def load_metadata(self, file_path: str):
"Metadata files do not exist. Please run the training pipeline first."
)

def extract_attribute(self, attribute: object, attr_name: str) -> str:
@staticmethod
def extract_attribute(attribute: object, attr_name: str) -> str:
"""
Description: Extract an attribute from the OpenML object.
"""
return getattr(attribute, attr_name, "")

def join_attributes(self, attribute: object, attr_name: str) -> str:
@staticmethod
def join_attributes(attribute: object, attr_name: str) -> str:
"""
Description: Join the attributes of the OpenML object.
"""
@@ -104,8 +107,8 @@ def join_attributes(self, attr_name: str) -> str:
else ""
)

@staticmethod
def create_combined_information_df_for_datasets(
self,
data_id: int | Sequence[int],
descriptions: Sequence[str],
joined_qualities: Sequence[str],
@@ -123,7 +126,8 @@ def create_combined_information_df_for_datasets(
}
)

def merge_all_columns_to_string(self, row: pd.Series) -> str:
@staticmethod
def merge_all_columns_to_string(row: pd.Series) -> str:
"""
Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
"""
Expand All @@ -143,8 +147,9 @@ def combine_metadata(
)
return all_dataset_metadata

@staticmethod
def subset_metadata(
self, subset_ids: Sequence[int] | None, all_dataset_metadata: pd.DataFrame
subset_ids: Sequence[int] | None, all_dataset_metadata: pd.DataFrame
):
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
@@ -200,7 +205,7 @@ def process_metadata(

all_dataset_metadata.to_csv(file_path)

if self.config.get("use_chroma_for_saving_metadata") == True:
if self.config.get("use_chroma_for_saving_metadata"):
client = chromadb.PersistentClient(
path=self.config["persist_dir"] + "metadata_db"
)
Expand Down Expand Up @@ -318,12 +323,14 @@ def get_all_metadata_from_openml(self):

return openml_data_object, data_id, all_objects, handler

def load_metadata_from_file(self, filename: str):
@staticmethod
def load_metadata_from_file(filename: str):
# Implement the function to load metadata from a file
with open(filename, "rb") as f:
return pickle.load(f)

def save_metadata_to_file(self, data: Tuple, save_filename: str):
@staticmethod
def save_metadata_to_file(data: Tuple, save_filename: str):
# Implement the function to save metadata to a file
with open(save_filename, "wb") as f:
pickle.dump(data, f)
4 changes: 4 additions & 0 deletions docs/UI/frontend.md
@@ -17,8 +17,12 @@
- Once the results of the RAG pipeline are obtained, the resulting list of IDs is queried from the metadata files (to be replaced with Elasticsearch later) and then the relevant data is displayed.
- The query-parsing LLM can now read the query and infer which columns the user finds relevant (e.g., "find me a dataset with multiple classes" enables the filter `num_classes >= 2`); a rough sketch of applying such a filter follows below.
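
As a loose illustration only — the structured-filter format and the dataframe contents below are assumptions for the sketch, not the actual query-parser output or UI code — applying such an inferred filter to a metadata dataframe might look like this:

```python
# Hypothetical sketch: the real query parser's output format and the UI code differ.
import pandas as pd

metadata = pd.DataFrame(
    {"did": [61, 1590, 1596], "name": ["iris", "adult", "covertype"], "num_classes": [3, 2, 7]}
)

# Suppose the query-parsing LLM returned this structured filter for
# "find me a dataset with multiple classes":
inferred_filter = {"column": "num_classes", "op": ">=", "value": 2}

ops = {
    ">=": lambda series, value: series >= value,
    "<=": lambda series, value: series <= value,
    "==": lambda series, value: series == value,
}
mask = ops[inferred_filter["op"]](metadata[inferred_filter["column"]], inferred_filter["value"])
print(metadata[mask])  # only rows satisfying num_classes >= 2 remain
```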

### paths.json
- Configure this file if any of the endpoints change.

### ui.py
- This is where all the above logic is executed and displayed using Streamlit.

### ui_utils.py
- This is where all the logic is defined.
- Query filtering
19 changes: 19 additions & 0 deletions docs/evaluation/api_reference.md
@@ -0,0 +1,19 @@
## Consistency evaluation

::: consistence_eval

## Streamlit labelling app

::: labellingapp

### Merging labels

::: merge_labels

### Run Batch Training

::: run_all_training

### Evaluation Utils

::: evaluation_utils
1 change: 0 additions & 1 deletion docs/evaluation/evaluate_training.md

This file was deleted.

29 changes: 29 additions & 0 deletions docs/evaluation/evaluation.md
@@ -0,0 +1,29 @@
# Evaluation of LLM models and techniques

## How to run
- Start the language server at the root of this repository with `./start_llm_service.sh`. This step is required; do not skip it.
- Run `python run_all_training.py` to train all models (fetch the data, create a vector store for each model, etc.).
- Run `python evaluate.py` to run all evaluations.
- Results are found in `./evaluation_results.csv` and `evaluation_results.png`.

## How to add a new evaluation

- Adding a new evaluation is fairly straightforward.
- (Note that `training_utils.py` already overrides some classes from the original training code, which means you can modify it to your heart's content without affecting the main code. Enjoy~)
- Step 1: Find the method you want to override and override the class/method in `training_utils.py` (a toy sketch of this pattern follows this list).
- Step 2: Add some if statements in `class ExperimentRunner` to ensure you don't break everything.
- Step 3: Follow the `ExperimentRunner` templates in `run_all_training.py` to add whatever you added in Step 2 as a new experiment.
- Give it a custom name so it is easy to understand what it does.
- Do not worry: experiments are cached and won't run again if they have already been run.
- Step 4: If you changed anything in the config, make sure you reset it afterwards; since the file runs in one go, it would otherwise affect the following experiments.
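
A toy, self-contained sketch of the override-and-dispatch pattern described above — the class names `BaseMetadataProcessor`, `TruncatedMetadataProcessor`, and the `truncate_metadata` config flag are invented for illustration and do not exist in the codebase:

```python
# Toy illustration only; the real override lives in training_utils.py and the
# dispatch logic in class ExperimentRunner (run_all_training.py).

class BaseMetadataProcessor:
    """Stand-in for a class from the original training code."""

    def process(self, text: str) -> str:
        return text.lower()


class TruncatedMetadataProcessor(BaseMetadataProcessor):
    """Step 1: override the method you care about."""

    def process(self, text: str) -> str:
        return super().process(text[:100])  # e.g. only embed the first 100 characters


def get_processor(config: dict) -> BaseMetadataProcessor:
    """Step 2: gate the override behind a config flag so other experiments are unaffected."""
    if config.get("truncate_metadata", False):
        return TruncatedMetadataProcessor()
    return BaseMetadataProcessor()


if __name__ == "__main__":
    # Step 3 would register an experiment (with a descriptive name) that sets this flag.
    print(get_processor({"truncate_metadata": True}).process("A VERY long dataset description ..."))
```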

## How to add a new metric

- In `evaluation_utils.py`, go to `class EvaluationProcessor` and add a new function that calculates your metric (you can use the existing metric functions as templates).
- Register the metric in `self.metric_methods` (a hypothetical example is sketched after the snippet below).
- When running the evaluation, add it to your metrics list:
```python
metrics = ["precision", "recall", "map"]
eval_path = Path("../data/evaluation/")
processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)
```
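
As a hedged illustration of the steps above, a new metric hook might look like the following; the function name `add_f1` and the exact column handling are assumptions based on the existing precision/recall helpers, not code from the repository:

```python
# Hypothetical new metric; in practice it would be added as a @staticmethod
# on EvaluationProcessor and registered in self.metric_methods.
import pandas as pd


def add_f1(grouped_df: pd.DataFrame) -> pd.DataFrame:
    """Compute F1 per group, assuming y_true / y_pred hold comma-joined ID strings."""

    def f1(row) -> float:
        y_true = set(row["y_true"].split(","))
        y_pred = set(row["y_pred"].split(","))
        true_positives = len(y_true & y_pred)
        precision = true_positives / len(y_pred) if y_pred else 0.0
        recall = true_positives / len(y_true) if y_true else 0.0
        denom = precision + recall
        return 2 * precision * recall / denom if denom else 0.0

    grouped_df["f1"] = grouped_df.apply(f1, axis=1)
    return grouped_df


# Registration inside EvaluationProcessor.__init__ would then look like:
#   self.metric_methods["f1"] = self.add_f1
# and the evaluation would request metrics=["precision", "recall", "map", "f1"].
```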
18 changes: 18 additions & 0 deletions docs/evaluation/index.md
@@ -0,0 +1,18 @@
# Evaluating the AI search
- The challenge with evaluation in this case was the lack of labels. To solve that, we created a simple Streamlit app that lets us label datasets according to a few tags.
- The evaluation pipeline runs the entire RAG + query-LLM pipeline on the subset of labelled data. The RAG does not have access to the entire OpenML database, only the labelled subset.

## Manual labelling
### Streamlit labelling app
- Refer to [labelling app](./labelling_tool.md) for more information.

### Merging labels
- Since multiple people labelled the datasets, it was useful to have a script that merges their labels into a single dataframe.
- The labels were generated per person using the labelling app and then merged into a single consistent dataframe using this script.
- Refer to [merging labels](./merging_labels.md) for more information.

### Consistency evaluation
- Since multiple people labelled the same datasets differently, Cohen's kappa was used to evaluate the consistency of the labelling. A value of ~0.45 was obtained, which indicates moderate agreement (a minimal sketch of this computation is shown below).
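
A minimal sketch of how such a score can be computed with scikit-learn; the label values below are invented, and the repository's actual computation lives in the `consistence_eval` module:

```python
# Minimal sketch, not the repository's consistence_eval code: computes
# Cohen's kappa between two annotators' tags for the same datasets.
from sklearn.metrics import cohen_kappa_score

# Invented example labels: one tag per dataset, from two annotators.
annotator_a = ["vision", "tabular", "nlp", "tabular", "vision"]
annotator_b = ["vision", "nlp", "nlp", "tabular", "tabular"]

kappa = cohen_kappa_score(annotator_a, annotator_b)
print(f"Cohen's kappa: {kappa:.2f}")  # ~0.41 here; 0.4-0.6 is usually read as moderate agreement
```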

## Running the evaluation
- Refer to [running the evaluation](./evaluation.md) for more information.
1 change: 0 additions & 1 deletion docs/evaluation/labelling_tool.md
@@ -34,4 +34,3 @@ You can now browse through datasets, and for each dataset you can select which o
Changes are not automatically persisted. If the 'save me' button is red, there are local unsaved changes. Click it to persist the changes.

We should be able to merge the different label files later without problem.

7 changes: 7 additions & 0 deletions docs/evaluation/merging_labels.md
@@ -0,0 +1,7 @@
# Merging labels
- Takes multiple JSON label files as input and merges them into a single CSV file with the columns `Topics,Dataset IDs`.

## How to use
- Place all the `label.json` files in the folder `/tools/data/all_labels`.
- Run `python merge_labels.py` from the `tools` directory.
- The results are written to `/data/evaluation/merged_labels.csv` (a rough sketch of the merge logic is shown below).
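
A rough, hypothetical sketch of the merge step — `merge_labels.py` is the authoritative implementation, and the JSON layout assumed here (a mapping from topic to a list of dataset IDs) may differ from the real files:

```python
# Hypothetical sketch only; merge_labels.py is the authoritative implementation.
import glob
import json
from collections import defaultdict

import pandas as pd

merged = defaultdict(set)  # topic -> set of dataset IDs
for path in glob.glob("tools/data/all_labels/*.json"):
    with open(path) as f:
        labels = json.load(f)  # assumed layout: {"topic": [dataset_id, ...], ...}
    for topic, dataset_ids in labels.items():
        merged[topic].update(int(i) for i in dataset_ids)

rows = [
    {"Topics": topic, "Dataset IDs": ",".join(str(i) for i in sorted(ids))}
    for topic, ids in merged.items()
]
pd.DataFrame(rows).to_csv("data/evaluation/merged_labels.csv", index=False)
```
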
8 changes: 0 additions & 8 deletions docs/evaluation/readme.md

This file was deleted.

1 change: 0 additions & 1 deletion docs/evaluation/run_batch_training.md

This file was deleted.

4 changes: 2 additions & 2 deletions evaluation/evaluate.py
@@ -4,9 +4,9 @@
import pandas as pd
from evaluation_utils import EvaluationProcessor
from tqdm import tqdm

metrics = ["precision", "recall", "map"]
eval_path = Path("../data/evaluation/")
processor = EvaluationProcessor(eval_path, sort_by=None)
processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)
results_display = processor.run()
print(results_display)

63 changes: 38 additions & 25 deletions evaluation/evaluation_utils.py
@@ -6,19 +6,31 @@


class EvaluationProcessor:
"""
Description: Process all the evaluated results, add the required metrics and save results as a csv/generate plots
"""
def __init__(
self, eval_path, metrics=["precision", "recall", "map"], sort_by="precision"
self, eval_path, metrics=None, sort_by="precision"
):
if metrics is None:
metrics = ["precision", "recall", "map"]
self.eval_path = eval_path
self.load_eval_queries = self.load_queries_from_csv()
self.query_templates = self.load_query_templates()
self.query_key_dict = self.create_query_key_dict()
self.metrics = metrics
self.sort_by = sort_by

# Define a dictionary to map metric names to their corresponding methods
self.metric_methods = {
"precision": self.add_precision,
"recall": self.add_recall,
"map": self.add_map
}

def run(self):
"""
Description: Run the evaluation process and display the results
Description: Load files, Run the evaluation process and display the results
"""
csv_files = self.load_result_files()
@@ -28,7 +40,7 @@ def run(self):

def load_result_files(self):
"""
Description: Find all the csv files in the evaluation directory
Description: Find all the csv files in the evaluation directory.
"""
return glob.glob(str(self.eval_path / "*/*/results.csv"))
@@ -42,9 +54,9 @@ def generate_results(self, csv_files):
for exp_path in tqdm(csv_files):
exp = pd.read_csv(exp_path).rename(columns={"did": "y_pred"})
exp["exp_folder_name"] = Path(exp_path).parent.name
exp["custom_experiement"] = ""
exp["custom_experiment"] = ""
# split exp_folder_name by @ to get extra information
exp["custom_experiement"] = exp["exp_folder_name"].apply(
exp["custom_experiment"] = exp["exp_folder_name"].apply(
lambda x: x.split("@")[0] if "@" in x else ""
)
exp.drop("exp_folder_name", axis=1, inplace=True)
@@ -56,24 +68,11 @@ def generate_results(self, csv_files):
"llm_model",
"query",
"llm_before_rag",
"custom_experiement",
"custom_experiment",
]
).agg({"y_true": ",".join, "y_pred": ",".join})

# add metrics
for metric in self.metrics:
if metric == "precision":
grouped_results_for_y_true_and_pred = self.add_precision(
grouped_results_for_y_true_and_pred
)
elif metric == "recall":
grouped_results_for_y_true_and_pred = self.add_recall(
grouped_results_for_y_true_and_pred
)
elif metric == "map":
grouped_results_for_y_true_and_pred = self.add_map(
grouped_results_for_y_true_and_pred
)
grouped_results_for_y_true_and_pred = self.add_metrics(grouped_results_for_y_true_and_pred)

# aggregate by computing the average of the metrics for each group
grouped_results_for_y_true_and_pred = (
@@ -82,7 +81,7 @@
"embedding_model",
"llm_model",
"llm_before_rag",
"custom_experiement",
"custom_experiment",
]
).agg({metric: "mean" for metric in self.metrics})
)
@@ -95,6 +94,16 @@ def generate_results(self, csv_files):
merged_df = merged_df.sort_values(by=self.sort_by, ascending=False)
return merged_df

def add_metrics(self, grouped_results_for_y_true_and_pred):
# Iterate over the metrics and apply the corresponding method if it exists
for metric in self.metrics:
if metric in self.metric_methods:
grouped_results_for_y_true_and_pred = self.metric_methods[metric](
grouped_results_for_y_true_and_pred
)

return grouped_results_for_y_true_and_pred

def load_queries_from_csv(self):
"""
Description: Load the queries from the csv file
@@ -137,7 +146,8 @@ def preprocess_results(self, results_df):
results_df["y_true"] = results_df["query"].map(self.query_key_dict)
return results_df

def add_precision(self, grouped_df):
@staticmethod
def add_precision(grouped_df):
"""
Description: Compute the precision metric for each group in the dataframe
"""
@@ -147,7 +157,8 @@ def add_precision(self, grouped_df):
]
return grouped_df

def add_recall(self, grouped_df):
@staticmethod
def add_recall(grouped_df):
"""
Description: Compute the recall metric for each group in the dataframe
@@ -158,7 +169,8 @@ def add_recall(self, grouped_df):
]
return grouped_df

def add_map(self, grouped_df):
@staticmethod
def add_map(grouped_df):
"""
Description: Compute the mean average precision metric for each group in the dataframe
"""
@@ -174,7 +186,8 @@ def add_map(self, grouped_df):
]
return grouped_df

def display_results(self, results_df):
@staticmethod
def display_results(results_df):
# add more preprocessing here
results_df = pd.DataFrame(results_df)
# heatmap results