New Feature: Query Decomposition, Small fixes: Fix comments & Errors printed #76

Open · wants to merge 3 commits into main

3 changes: 3 additions & 0 deletions .gitignore
@@ -68,6 +68,9 @@ instance/

# Scrapy stuff:
.scrapy
test.py
tests/
aresdocs/

# Sphinx documentation
docs/_build/
@@ -492,6 +492,94 @@ def save_synthetic_queries(documents: pd.DataFrame, filename: str) -> None:
documents.to_csv(filename, index=False, sep="\t")
print("Saved synthetic queries to: " + filename)

def query_decomposition_post_processing(synthetic_queries_filename: str):
"""
Decompose each synthetic query in the given file into simple queries and save the expanded set back to the same file.

Parameters:
- synthetic_queries_filename (str): Path to the TSV file of synthetic queries.
"""
# Read the synthetic queries from the specified file
synth_queries = pd.read_csv(synthetic_queries_filename, sep="\t")

# Drop any duplicated columns
synth_queries = synth_queries.loc[:, ~synth_queries.columns.duplicated()]

model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

decomposed_data = []
for _, row in synth_queries.iterrows():
# Decompose the complex query
simple_queries = decompose_query_with_model(row["Queries"], tokenizer, model)

# Add a new row for each decomposed query
for simple_query in simple_queries:
new_row = row.copy()
new_row["Queries"] = simple_query
decomposed_data.append(new_row)

# Rebuild the synthetic queries DataFrame from the decomposed rows
synth_queries = pd.DataFrame(decomposed_data)

# Save the new synth queries to the file
save_synthetic_queries(synth_queries, synthetic_queries_filename)
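
A minimal usage sketch of the new post-processing step (the filename below is hypothetical): every multi-part query in the TSV becomes one row per simple query, and the expanded set overwrites the original file.

# Illustrative example, not part of this diff; "synthetic_queries.tsv" is a placeholder path
# whose "Queries" column contains, e.g., "What is RAG and how is it evaluated?"
query_decomposition_post_processing("synthetic_queries.tsv")
# The file now holds one row per simple query, e.g.
#   "What is RAG?"
#   "How is RAG evaluated?"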

def decompose_query_with_model(query: str, tokenizer, model):
"""
Provided a lightweight model decompose a given query into subqueries.

Parameters:
- query (str): The query to be decomposed.
- tokenizer (): Tokenizer for model
- model (model): LM used to process query decomposition

Returns:
- list: A list of resultant queries that are in the question.
"""

input_text = f"""
You are an expert at decomposing questions. At the end of this prompt you are given a query.
The query can be decomposed into simple queries if it contains multiple questions.
If it does not contain multiple questions, return the original query. If it does,
return each of those questions in the format below. Be very careful not to repeat any queries; this is very important.
There may be no simple queries, or there may be many.

The output should be all of the questions, separated by commas.
Include no other information, and do not use double quotes.

The following are examples of a complex query being decomposed into simple queries.

Examples:

Decompose: "What were Einstein’s key theories and how did they influence nuclear technology?"
- "What were Einstein’s key theories?"
- "How did Einstein's key theories influence nuclear technology?"

Decompose: "Explain the concept of quantum entanglement and its potential applications."
- "Explain the concept of quantum entanglement."
- "What are the potential applications of quantum entanglement?"

Decompose: "Describe the process of photosynthesis and its importance to the ecosystem."
- "What is the process of photosynthesis?"
- "Why is photosynthesis important to the ecosystem?"

Decompose: "How did the industrial revolution shape modern economies, influence technology, and society?"
- "How did the industrial revolution shape modern economies?"
- "How did the industrial revolution influence technology?"
- "How did the industrial revolution influence society?"

Decompose: "What is the first letter of the alphabet?"
- "What is the first letter of the alphabet?"

Decompose: "How many cows are in America?"
"How many cows are in America?"

Here is the query to decompose: {query}"""
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

# Generate the model output
outputs = model.generate(**inputs, max_length=128, num_return_sequences=1)

# Decode the generated text and split it into a list of simple queries,
# stripping whitespace and dropping empty fragments
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
simple_queries = [q.strip() for q in decoded.replace(' - ', ',').split(',') if q.strip()]
return simple_queries
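
As a rough sketch of the expected behavior (outputs vary by model, and google/flan-t5-xl needs several GB of memory to load):

# Illustrative example, not part of this diff:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl")
subqueries = decompose_query_with_model(
"Describe the process of photosynthesis and its importance to the ecosystem.",
tokenizer, model)
# Roughly expected result:
# ['What is the process of photosynthesis?',
#  'Why is photosynthesis important to the ecosystem?']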

def generate_synthetic_queries(documents: pd.DataFrame, settings: dict) -> pd.DataFrame:
"""
Generate synthetic queries using the FLAN approach.
65 changes: 61 additions & 4 deletions ares/LLM_as_a_Judge_Adaptation/LLM_Synthetic_Generation.py
@@ -261,6 +261,25 @@ def generate_synthetic_query_vllm_approach(document: str, synthetic_query_prompt
def generate_synthetic_answer_api_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str,
length_of_fewshot_prompt: int, model_name: str, for_fever_dataset=False,
for_wow_dataset=False):
"""
Generates synthetic answers using a model's API based on the provided document and question.

This function constructs an answer dynamically using the api and model provided.

Args:
document (str): The document text based on which the contradictory answer is to be generated.
question (str): The question text based on the document.
synthetic_answer_prompt (str): The initial prompt text to which the document and question will be appended.
fewshot_examples (str): Few-shot examples to include in the prompt for the API.
api_url (str): The API endpoint URL.
api_key (str): The API key for authentication.
model_name (str): The model name to be used in the API.
for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.

Returns:
str: The generated answer text.
"""
# Construct the prompt without the document based on the dataset type
prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n"
if for_fever_dataset:
Expand Down Expand Up @@ -317,12 +336,31 @@ def generate_synthetic_answer_api_approach(document: str, question: str, synthet

return final_response
except Exception as e:
print(f"Error generating synthetic queries: {e}")
print(f"Error generating synthetic answers: {e}")
continue

def generate_synthetic_answer_azure_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str,
length_of_fewshot_prompt: int, azure_openai_config: dict, for_fever_dataset=False,
for_wow_dataset=False):
"""
Generates synthetic answers using provided Azure OpenAI model based on the provided document and question.

This function constructs an answer dynamically using the api and model provided.

Args:
document (str): The document text based on which the contradictory answer is to be generated.
question (str): The question text based on the document.
synthetic_answer_prompt (str): The initial prompt text to which the document and question will be appended.
fewshot_examples (str): Few-shot examples to include in the prompt for the API.
api_url (str): The API endpoint URL.
api_key (str): The API key for authentication.
model_name (str): The model name to be used in the API.
for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.

Returns:
str: The generated answer text.
"""
# Construct the prompt without the document based on the dataset type
prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n"
if for_fever_dataset:
Expand Down Expand Up @@ -382,12 +420,31 @@ def generate_synthetic_answer_azure_approach(document: str, question: str, synth

return final_response
except Exception as e:
print(f"Error generating synthetic queries: {e}")
print(f"Error generating synthetic answers: {e}")
continue

def generate_synthetic_answer_vllm_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str,
length_of_fewshot_prompt: int, model_name: str, host_url: str, for_fever_dataset=False,
for_wow_dataset=False):
for_wow_dataset=False):
"""
Generates synthetic answers using an API model based on the provided document and question.

This function constructs an answer dynamically using the vllm provided at the host url.

Args:
document (str): The document text based on which the contradictory answer is to be generated.
question (str): The question text based on the document.
synthetic_answer_prompt (str): The initial prompt text to which the document and question will be appended.
fewshot_examples (str): Few-shot examples to include in the prompt for the API.
api_url (str): The API endpoint URL.
api_key (str): The API key for authentication.
model_name (str): The model name to be used in the API.
for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.

Returns:
str: The generated answer text.
"""
# Construct the prompt without the document based on the dataset type
prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n"
if for_fever_dataset:
Expand Down Expand Up @@ -446,7 +503,7 @@ def generate_synthetic_answer_vllm_approach(document: str, question: str, synthe

return final_response
except Exception as e:
print(f"Error generating synthetic queries: {e}")
print(f"Error generating synthetic answers: {e}")
continue

def generate_synthetic_contradictory_answers_api_approach(document: str, question: str, synthetic_contradictory_answer_prompt: str, fewshot_examples: str,
88 changes: 85 additions & 3 deletions ares/RAG_Automatic_Evaluation/LLMJudge_RAG_Compared_Scoring.py
@@ -3,9 +3,10 @@
T5Tokenizer, T5EncoderModel, T5ForConditionalGeneration,
BertModel, AutoTokenizer, AutoModel, GPT2Tokenizer,
TrainingArguments, get_scheduler,
AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification,
MptForSequenceClassification
AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification,
MptForSequenceClassification, AutoModelForSeq2SeqLM
)

import sys
import pandas as pd
import numpy as np
@@ -449,14 +450,15 @@ def filter_dataset(rag_type: str = "question_answering") -> tuple[str, str, str]

return context_relevance_system_prompt, answer_faithfulness_system_prompt, answer_relevance_system_prompt

def preprocess_data(test_set_selection: str, label_column: str, labels: list):
def preprocess_data(test_set_selection: str, label_column: str, labels: list, query_decomposition: bool):
"""
Preprocesses the data for evaluation.

Parameters:
- test_set_selection (str): The file path to the test set selection in CSV format.
- label_column (str): The column name in the test set that contains the labels.
- labels (list): A list of labels to be used for filtering the test set.
- query_decomposition (bool): Whether to decompose multi-part queries into simple queries during preprocessing.

Returns:
- Tuple[pd.DataFrame, str]: A tuple containing the preprocessed test set DataFrame and the name of the text column.
@@ -488,15 +490,95 @@

# Filter out rows where the text column has the value "Error"
test_set = test_set[test_set[text_column] != "Error"]


# Check if the dataset has fewer than 10 rows after filtering
if len(test_set) < 10:
raise ValueError("Insufficient Data: Dataset has fewer than 10 rows after filtering!")

if query_decomposition:
model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

decomposed_data = []
for _, row in test_set.iterrows():
# Decompose the complex query
simple_queries = decompose_query_with_model(row["Queries"], tokenizer, model)

# Add a new row for each decomposed query
for simple_query in simple_queries:
new_row = row.copy()
new_row["Queries"] = simple_query
decomposed_data.append(new_row)

# Rebuild the test set from the decomposed rows
test_set = pd.DataFrame(decomposed_data)

return test_set, text_column
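
A quick sketch of the new flag in use during evaluation preprocessing (the TSV path and label column below are hypothetical):

# Illustrative example, not part of this diff:
test_set, text_column = preprocess_data(
"evaluation_set.tsv", "Context_Relevance_Label", ["Context_Relevance_Label"],
query_decomposition=True)
# Multi-part entries in the "Queries" column now occupy one row per simple query.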

############################################################

def decompose_query_with_model(query, tokenizer, model):
"""
Provided a lightweight model decompose a given query into subqueries.

Parameters:
- query (str): The query to be decomposed.
- tokenizer (): Tokenizer for model
- model (model): LM used to process query decomposition

Returns:
- list: A list of resultant queries that are in the question.
"""

input_text = f"""
You are an expert at decomposing questions. At the end of this prompt you are given a query.
The query can be decomposed into simple queries if it contains multiple questions.
If it does not contain multiple questions, return the original query. If it does,
return each of those questions in the format below. Be very careful not to repeat any queries; this is very important.
There may be no simple queries, or there may be many.

The output should be all of the questions, separated by commas.
Include no other information, and do not use double quotes.

The following are examples of a complex query being decomposed into simple queries.

Examples:

Decompose: "What were Einstein’s key theories and how did they influence nuclear technology?"
- "What were Einstein’s key theories?"
- "How did Einstein's key theories influence nuclear technology?"

Decompose: "Explain the concept of quantum entanglement and its potential applications."
- "Explain the concept of quantum entanglement."
- "What are the potential applications of quantum entanglement?"

Decompose: "Describe the process of photosynthesis and its importance to the ecosystem."
- "What is the process of photosynthesis?"
- "Why is photosynthesis important to the ecosystem?"

Decompose: "How did the industrial revolution shape modern economies, influence technology, and society?"
- "How did the industrial revolution shape modern economies?"
- "How did the industrial revolution influence technology?"
- "How did the industrial revolution influence society?"

Decompose: "What is the first letter of the alphabet?"
- "What is the first letter of the alphabet?"

Decompose: "How many cows are in America?"
"How many cows are in America?"

Here is the query to decompose: {query}"""
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

# Generate the model output
outputs = model.generate(**inputs, max_length=128, num_return_sequences=1)

# Decode the generated text and split it into a list of simple queries,
# stripping whitespace and dropping empty fragments
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
simple_queries = [q.strip() for q in decoded.replace(' - ', ',').split(',') if q.strip()]
return simple_queries

def togetherai_list_models(api_key: str) -> list:
"""
Lists available models from the Together API.
10 changes: 6 additions & 4 deletions ares/ares.py
@@ -34,10 +34,11 @@ class ARES:
"request_delay": (int, 0), # Optional with default
"vllm": (bool, False), # Optional with default
"azure_openai_config": (dict, None), # Optional with default
"host_url": (str, "None") # Optional with default
"host_url": (str, "None"), # Optional with default
},

"synthetic_query_generator": {
"query_decomposition": (bool, False), # Optional with default
"document_filepaths": (list, None), # Required parameter with no default value
"few_shot_prompt_filenames": (list, None), # Required parameter with no default value
"synthetic_queries_filenames": (list, None), # Required parameter with no default value
@@ -60,7 +61,7 @@ class ARES:
"synthetic_query_prompt": (str, "You are an expert question-answering system. Generate only one question based on the provided document. Ensure the question is answerable within the context of the document. Do not generate multiple questions. Do not provide labels, headers, or additional text. Only return a single, clear question. Generating more than one question will be considered incorrect output.\n\n"),
"synthetic_valid_answer_prompt": (str, "You are an expert question-answering system. You must create an answer for the provided question. The answer must be answerable within the context of the document. Return only the answer, nothing else.\n\n"),
"synthetic_contradictory_answer_prompt": (str, "Create an answer for the given question that contradicts the provided document. You should create false information that disagrees with what exists within the content of the document. Return only the false answer, without any labels or additional text.\n\n"),
"azure_openai_config": (dict, None) # Optional with default
"azure_openai_config": (dict, None), # Optional with default
},

"classifier_model": {
@@ -79,7 +80,7 @@ class ARES:
"number_of_runs": (int, 1), # Optional with default
"num_warmup_steps": (int, 100), # Optional with default
"training_row_limit": (int, -1), # Optional with default
"validation_row_limit": (int, -1) # Optional with default
"validation_row_limit": (int, -1), # Optional with default
},

"ppi": {
@@ -103,7 +104,8 @@ class ARES:
"machine_label_llm_model": (str, "None"), # Optional with default
"gold_machine_label_path": (str, "None"), # Optional with default
"prediction_filepaths": (list, ["None"]), # Optional with default
"azure_openai_config": (dict, None) # Optional with default
"azure_openai_config": (dict, None), # Optional with default
"query_decomposition": (bool, False), # Optional with default
}
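
A hedged configuration sketch showing where the new flag plugs in (the filepaths are placeholders, and unspecified fields fall back to the defaults above):

# Illustrative example, not part of this diff:
from ares import ARES

ares = ARES(synthetic_query_generator={
"document_filepaths": ["docs.tsv"],
"few_shot_prompt_filenames": ["few_shot_prompt.tsv"],
"synthetic_queries_filenames": ["synthetic_queries.tsv"],
"query_decomposition": True,  # decompose multi-part queries after generation
})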

# azure_openai_config dictionary is formatted with 3 entries: