New Feature: Query Decomposition, Small fixes: Fix comments & Errors printed #76

Open · wants to merge 3 commits into main

3 changes: 3 additions & 0 deletions .gitignore
@@ -68,6 +68,9 @@ instance/

# Scrapy stuff:
.scrapy
test.py
tests/
aresdocs/

# Sphinx documentation
docs/_build/
@@ -492,6 +492,94 @@ def save_synthetic_queries(documents: pd.DataFrame, filename: str) -> None:
documents.to_csv(filename, index=False, sep="\t")
print("Saved synthetic queries to: " + filename)

def query_decomposition_post_processing(synthetic_queries_filename: str):
"""
Decompose each synthetic query in the given file into simple queries and save the expanded set back to the same file.

Parameters:
- synthetic_queries_filename (str): Path to the TSV file of synthetic queries.
"""
# Read the synthetic queries from the specified file
synth_queries = pd.read_csv(synthetic_queries_filename, sep="\t")

# Drop any duplicated columns
synth_queries = synth_queries.loc[:, ~synth_queries.columns.duplicated()]

model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

decomposed_data = []
for _, row in synth_queries.iterrows():
# Decompose the complex query
simple_queries = decompose_query_with_model(row["Queries"], tokenizer, model)

# Add a new row for each decomposed query
for simple_query in simple_queries:
new_row = row.copy()
new_row["Queries"] = simple_query
decomposed_data.append(new_row)

# Rebuild the synthetic queries DataFrame from the decomposed rows
synth_queries = pd.DataFrame(decomposed_data)

# Save the new synth queries to the file
save_synthetic_queries(synth_queries, synthetic_queries_filename)
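
A minimal usage sketch of the new post-processing step (the filename below is hypothetical): every multi-part query in the TSV becomes one row per simple query, and the expanded set overwrites the original file.

# Illustrative example, not part of this diff; "synthetic_queries.tsv" is a placeholder path
# whose "Queries" column contains, e.g., "What is RAG and how is it evaluated?"
query_decomposition_post_processing("synthetic_queries.tsv")
# The file now holds one row per simple query, e.g.
#   "What is RAG?"
#   "How is RAG evaluated?"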

def decompose_query_with_model(query: str, tokenizer, model):
"""
Provided a lightweight model decompose a given query into subqueries.

Parameters:
- query (str): The query to be decomposed.
- tokenizer (): Tokenizer for model
- model (model): LM used to process query decomposition

Returns:
- list: A list of resultant queries that are in the question.
"""

input_text = f"""
You are an expert at decomposing questions. At the end of this prompt you are given a query.
The query can be decomposed into simple queries if it contains multiple questions.
If it does not contain multiple questions, return the original query. If it does,
return each of those questions in the format below. Be very careful not to repeat any queries; this is very important.
There may be no simple queries, or there may be many.

The output should be all of the questions, separated by commas.
Include no other information, and do not use double quotes.

The following are examples of a complex query being decomposed into simple queries.

Examples:

Decompose: "What were Einstein’s key theories and how did they influence nuclear technology?"
- "What were Einstein’s key theories?"
- "How did Einstein's key theories influence nuclear technology?"

Decompose: "Explain the concept of quantum entanglement and its potential applications."
- "Explain the concept of quantum entanglement."
- "What are the potential applications of quantum entanglement?"

Decompose: "Describe the process of photosynthesis and its importance to the ecosystem."
- "What is the process of photosynthesis?"
- "Why is photosynthesis important to the ecosystem?"

Decompose: "How did the industrial revolution shape modern economies, influence technology, and society?"
- "How did the industrial revolution shape modern economies?"
- "How did the industrial revolution influence technology?"
- "How did the industrial revolution influence society?"

Decompose: "What is the first letter of the alphabet?"
- "What is the first letter of the alphabet?"

Decompose: "How many cows are in America?"
"How many cows are in America?"

Here is the query to decompose: {query}"""
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

# Generate the model output
outputs = model.generate(**inputs, max_length=128, num_return_sequences=1)

# Decode the generated text and split it into a list of simple queries,
# stripping whitespace and dropping empty fragments
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
simple_queries = [q.strip() for q in decoded.replace(' - ', ',').split(',') if q.strip()]
return simple_queries
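
As a rough sketch of the expected behavior (outputs vary by model, and google/flan-t5-xl needs several GB of memory to load):

# Illustrative example, not part of this diff:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl")
subqueries = decompose_query_with_model(
"Describe the process of photosynthesis and its importance to the ecosystem.",
tokenizer, model)
# Roughly expected result:
# ['What is the process of photosynthesis?',
#  'Why is photosynthesis important to the ecosystem?']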

def generate_synthetic_queries(documents: pd.DataFrame, settings: dict) -> pd.DataFrame:
"""
Generate synthetic queries using the FLAN approach.
65 changes: 61 additions & 4 deletions ares/LLM_as_a_Judge_Adaptation/LLM_Synthetic_Generation.py
@@ -261,6 +261,25 @@ def generate_synthetic_query_vllm_approach(document: str, synthetic_query_prompt
def generate_synthetic_answer_api_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str,
length_of_fewshot_prompt: int, model_name: str, for_fever_dataset=False,
for_wow_dataset=False):
"""
Generates synthetic answers using a model's API based on the provided document and question.

This function constructs an answer dynamically using the api and model provided.

Args:
document (str): The document text based on which the contradictory answer is to be generated.
question (str): The question text based on the document.
synthetic_answer_prompt (str): The initial prompt text to which the document and question will be appended.
fewshot_examples (str): Few-shot examples to include in the prompt for the API.
api_url (str): The API endpoint URL.
api_key (str): The API key for authentication.
model_name (str): The model name to be used in the API.
for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.

Returns:
str: The generated answer text.
"""
# Construct the prompt without the document based on the dataset type
prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n"
if for_fever_dataset:
Expand Down Expand Up @@ -317,12 +336,31 @@ def generate_synthetic_answer_api_approach(document: str, question: str, synthet

return final_response
except Exception as e:
print(f"Error generating synthetic queries: {e}")
print(f"Error generating synthetic answers: {e}")
continue

def generate_synthetic_answer_azure_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str,
length_of_fewshot_prompt: int, azure_openai_config: dict, for_fever_dataset=False,
for_wow_dataset=False):
"""
Generates synthetic answers using provided Azure OpenAI model based on the provided document and question.

This function constructs an answer dynamically using the api and model provided.

Args:
document (str): The document text based on which the contradictory answer is to be generated.
question (str): The question text based on the document.
synthetic_answer_prompt (str): The initial prompt text to which the document and question will be appended.
fewshot_examples (str): Few-shot examples to include in the prompt for the API.
api_url (str): The API endpoint URL.
api_key (str): The API key for authentication.
model_name (str): The model name to be used in the API.
for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.

Returns:
str: The generated answer text.
"""
# Construct the prompt without the document based on the dataset type
prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n"
if for_fever_dataset:
Expand Down Expand Up @@ -382,12 +420,31 @@ def generate_synthetic_answer_azure_approach(document: str, question: str, synth

return final_response
except Exception as e:
print(f"Error generating synthetic queries: {e}")
print(f"Error generating synthetic answers: {e}")
continue

def generate_synthetic_answer_vllm_approach(document: str, question: str, synthetic_answer_prompt: str, prompt: str,
length_of_fewshot_prompt: int, model_name: str, host_url: str, for_fever_dataset=False,
for_wow_dataset=False):
for_wow_dataset=False):
"""
Generates synthetic answers using an API model based on the provided document and question.

This function constructs an answer dynamically using the vllm provided at the host url.

Args:
document (str): The document text based on which the contradictory answer is to be generated.
question (str): The question text based on the document.
synthetic_answer_prompt (str): The initial prompt text to which the document and question will be appended.
fewshot_examples (str): Few-shot examples to include in the prompt for the API.
api_url (str): The API endpoint URL.
api_key (str): The API key for authentication.
model_name (str): The model name to be used in the API.
for_fever_dataset (bool, optional): Flag to indicate if the function is being used for the FEVER dataset. Defaults to False.
for_wow_dataset (bool, optional): Flag to indicate if the function is being used for the WoW dataset. Defaults to False.

Returns:
str: The generated answer text.
"""
# Construct the prompt without the document based on the dataset type
prompt_without_document = prompt + "Example " + str(length_of_fewshot_prompt + 1) + ":\n"
if for_fever_dataset:
Expand Down Expand Up @@ -446,7 +503,7 @@ def generate_synthetic_answer_vllm_approach(document: str, question: str, synthe

return final_response
except Exception as e:
print(f"Error generating synthetic queries: {e}")
print(f"Error generating synthetic answers: {e}")
continue

def generate_synthetic_contradictory_answers_api_approach(document: str, question: str, synthetic_contradictory_answer_prompt: str, fewshot_examples: str,
88 changes: 85 additions & 3 deletions ares/RAG_Automatic_Evaluation/LLMJudge_RAG_Compared_Scoring.py
@@ -3,9 +3,10 @@
T5Tokenizer, T5EncoderModel, T5ForConditionalGeneration,
BertModel, AutoTokenizer, AutoModel, GPT2Tokenizer,
TrainingArguments, get_scheduler,
AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification,
MptForSequenceClassification
AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification,
MptForSequenceClassification, AutoModelForSeq2SeqLM
)

import sys
import pandas as pd
import numpy as np
@@ -449,14 +450,15 @@ def filter_dataset(rag_type: str = "question_answering") -> tuple[str, str, str]

return context_relevance_system_prompt, answer_faithfulness_system_prompt, answer_relevance_system_prompt

def preprocess_data(test_set_selection: str, label_column: str, labels: list):
def preprocess_data(test_set_selection: str, label_column: str, labels: list, query_decomposition: bool):
"""
Preprocesses the data for evaluation.

Parameters:
- test_set_selection (str): The file path to the test set selection in CSV format.
- label_column (str): The column name in the test set that contains the labels.
- labels (list): A list of labels to be used for filtering the test set.
- query_decomposition (bool): Whether to decompose multi-part queries into simple queries during preprocessing.

Returns:
- Tuple[pd.DataFrame, str]: A tuple containing the preprocessed test set DataFrame and the name of the text column.
@@ -488,15 +490,95 @@

# Filter out rows where the text column has the value "Error"
test_set = test_set[test_set[text_column] != "Error"]


# Check if the dataset has fewer than 10 rows after filtering
if len(test_set) < 10:
raise ValueError("Insufficient Data: Dataset has fewer than 10 rows after filtering!")

if query_decomposition:
model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

decomposed_data = []
for _, row in test_set.iterrows():
# Decompose the complex query
simple_queries = decompose_query_with_model(row["Queries"], tokenizer, model)

# Add a new row for each decomposed query
for simple_query in simple_queries:
new_row = row.copy()
new_row["Queries"] = simple_query
decomposed_data.append(new_row)

# Rebuild the test set from the decomposed rows
test_set = pd.DataFrame(decomposed_data)

return test_set, text_column
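
A quick sketch of the new flag in use during evaluation preprocessing (the TSV path and label column below are hypothetical):

# Illustrative example, not part of this diff:
test_set, text_column = preprocess_data(
"evaluation_set.tsv", "Context_Relevance_Label", ["Context_Relevance_Label"],
query_decomposition=True)
# Multi-part entries in the "Queries" column now occupy one row per simple query.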

############################################################

def decompose_query_with_model(query, tokenizer, model):
"""
Provided a lightweight model decompose a given query into subqueries.

Parameters:
- query (str): The query to be decomposed.
- tokenizer (): Tokenizer for model
- model (model): LM used to process query decomposition

Returns:
- list: A list of resultant queries that are in the question.
"""

input_text = f"""
You are an expert at decomposing questions. At the end of this prompt you are given a query.
The query can be decomposed into simple queries if it contains multiple questions.
If it does not contain multiple questions, return the original query. If it does,
return each of those questions in the format below. Be very careful not to repeat any queries; this is very important.
There may be no simple queries, or there may be many.

The output should be all of the questions, separated by commas.
Include no other information, and do not use double quotes.

The following are examples of a complex query being decomposed into simple queries.

Examples:

Decompose: "What were Einstein’s key theories and how did they influence nuclear technology?"
- "What were Einstein’s key theories?"
- "How did Einstein's key theories influence nuclear technology?"

Decompose: "Explain the concept of quantum entanglement and its potential applications."
- "Explain the concept of quantum entanglement."
- "What are the potential applications of quantum entanglement?"

Decompose: "Describe the process of photosynthesis and its importance to the ecosystem."
- "What is the process of photosynthesis?"
- "Why is photosynthesis important to the ecosystem?"

Decompose: "How did the industrial revolution shape modern economies, influence technology, and society?"
- "How did the industrial revolution shape modern economies?"
- "How did the industrial revolution influence technology?"
- "How did the industrial revolution influence society?"

Decompose: "What is the first letter of the alphabet?"
- "What is the first letter of the alphabet?"

Decompose: "How many cows are in America?"
"How many cows are in America?"

Here is the query to decompose: {query}"""
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

# Generate the model output
outputs = model.generate(**inputs, max_length=128, num_return_sequences=1)

# Decode the generated text and split it into a list of simple queries,
# stripping whitespace and dropping empty fragments
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
simple_queries = [q.strip() for q in decoded.replace(' - ', ',').split(',') if q.strip()]
return simple_queries

def togetherai_list_models(api_key: str) -> list:
"""
Lists available models from the Together API.
10 changes: 6 additions & 4 deletions ares/ares.py
@@ -34,10 +34,11 @@ class ARES:
"request_delay": (int, 0), # Optional with default
"vllm": (bool, False), # Optional with default
"azure_openai_config": (dict, None), # Optional with default
"host_url": (str, "None") # Optional with default
"host_url": (str, "None"), # Optional with default
},

"synthetic_query_generator": {
"query_decomposition": (bool, False), # Optional with default
"document_filepaths": (list, None), # Required parameter with no default value
"few_shot_prompt_filenames": (list, None), # Required parameter with no default value
"synthetic_queries_filenames": (list, None), # Required parameter with no default value
@@ -60,7 +61,7 @@ class ARES:
"synthetic_query_prompt": (str, "You are an expert question-answering system. Generate only one question based on the provided document. Ensure the question is answerable within the context of the document. Do not generate multiple questions. Do not provide labels, headers, or additional text. Only return a single, clear question. Generating more than one question will be considered incorrect output.\n\n"),
"synthetic_valid_answer_prompt": (str, "You are an expert question-answering system. You must create an answer for the provided question. The answer must be answerable within the context of the document. Return only the answer, nothing else.\n\n"),
"synthetic_contradictory_answer_prompt": (str, "Create an answer for the given question that contradicts the provided document. You should create false information that disagrees with what exists within the content of the document. Return only the false answer, without any labels or additional text.\n\n"),
"azure_openai_config": (dict, None) # Optional with default
"azure_openai_config": (dict, None), # Optional with default
},

"classifier_model": {
@@ -79,7 +80,7 @@ class ARES:
"number_of_runs": (int, 1), # Optional with default
"num_warmup_steps": (int, 100), # Optional with default
"training_row_limit": (int, -1), # Optional with default
"validation_row_limit": (int, -1) # Optional with default
"validation_row_limit": (int, -1), # Optional with default
},

"ppi": {
@@ -103,7 +104,8 @@ class ARES:
"machine_label_llm_model": (str, "None"), # Optional with default
"gold_machine_label_path": (str, "None"), # Optional with default
"prediction_filepaths": (list, ["None"]), # Optional with default
"azure_openai_config": (dict, None) # Optional with default
"azure_openai_config": (dict, None), # Optional with default
"query_decomposition": (bool, False), # Optional with default
}
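
A hedged configuration sketch showing where the new flag plugs in (the filepaths are placeholders, and unspecified fields fall back to the defaults above):

# Illustrative example, not part of this diff:
from ares import ARES

ares = ARES(synthetic_query_generator={
"document_filepaths": ["docs.tsv"],
"few_shot_prompt_filenames": ["few_shot_prompt.tsv"],
"synthetic_queries_filenames": ["synthetic_queries.tsv"],
"query_decomposition": True,  # decompose multi-part queries after generation
})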

# azure_openai_config dictionary is formatted with 3 entries: