test: add facts grounding prompts

MoritzLaurer · Jan 2, 2025 · d97fbdc · d97fbdc
1 parent 8d52847
commit d97fbdc
Show file tree

Hide file tree

Showing 8 changed files with 478 additions and 0 deletions.
diff --git a/tests/test_data/facts_grounding_prompts/grounding_accuracy_implicit_span_level.yaml b/tests/test_data/facts_grounding_prompts/grounding_accuracy_implicit_span_level.yaml
@@ -0,0 +1,52 @@
+prompt:
+  template: |-
+    Your task is to check if the Response is accurate to the Evidence.
+    Generate 'Accurate' if the Response is accurate when verified according to the Evidence, or 'Inaccurate' if the Response is inaccurate (contradicts the evidence) or cannot be verified.
+
+    **Query**:
+
+    {{user_request}}
+
+    **End of Query**
+
+    **Evidence**
+
+    {{context_document}}
+
+    **End of Evidence**
+
+    **Response**:
+
+    {{response}}
+
+    **End of Response**
+
+
+    Break down the Response into sentences and classify each one separately, then give the final answer: If even one of the sentences is inaccurate, then the Response is inaccurate.
+
+    For example, your output should be of this format:
+    Sentence 1: <Sentence 1>
+    Sentence 1 label: Accurate/Inaccurate (choose 1)
+    Sentence 2: <Sentence 2>
+    Sentence 2 label: Accurate/Inaccurate (choose 1)
+    Sentence 3: <Sentence 3>
+    Sentence 3 label: Accurate/Inaccurate (choose 1)
+    [...]
+    Final Answer: Accurate/Inaccurate (choose 1)
+  template_variables:
+    - user_request
+    - context_document
+    - response
+  metadata:
+    description: "An evaluation prompt from the paper 'The FACTS Grounding Leaderboard: Benchmarking LLMs’ Ability to Ground
+      Responses to Long-Form Input' by Google DeepMind.\n    The prompt was copied from the evaluation_prompts.csv file from
+      Kaggle.\n    This specific prompt elicits a binary accurate/inaccurate classifier for the entire response after generating
+      and classifying each sentence separately."
+    evaluation_method: implicit_span_level
+    tags:
+      - fact-checking
+    version: 1.0.0
+    author: Google DeepMind
+    source: https://www.kaggle.com/datasets/deepmind/FACTS-grounding-examples?resource=download&select=evaluation_prompts.csv
+  client_parameters: {}
+  custom_data: {}
diff --git a/tests/test_data/facts_grounding_prompts/grounding_accuracy_response_level.yaml b/tests/test_data/facts_grounding_prompts/grounding_accuracy_response_level.yaml
@@ -0,0 +1,40 @@
+prompt:
+  template: |-
+    Your task is to check if the Response is accurate to the Evidence.
+    Generate 'Accurate' if the Response is accurate when verified according to the Evidence, or 'Inaccurate' if the Response is inaccurate (contradicts the evidence) or cannot be verified.
+
+    **Query**:
+
+    {{user_request}}
+
+    **End of Query**
+
+    **Evidence**
+
+    {{context_document}}
+
+    **End of Evidence**
+
+    **Response**:
+
+    {{response}}
+
+    **End of Response**
+
+    Let's think step-by-step.
+  template_variables:
+    - user_request
+    - context_document
+    - response
+  metadata:
+    description: "An evaluation prompt from the paper 'The FACTS Grounding Leaderboard: Benchmarking LLMs’ Ability to Ground
+      Responses to Long-Form Input' by Google DeepMind.\n    The prompt was copied from the evaluation_prompts.csv file from
+      Kaggle.\n    This specific prompt elicits a binary accurate/inaccurate classifier for the entire response."
+    evaluation_method: response_level
+    tags:
+      - fact-checking
+    version: 1.0.0
+    author: Google DeepMind
+    source: https://www.kaggle.com/datasets/deepmind/FACTS-grounding-examples?resource=download&select=evaluation_prompts.csv
+  client_parameters: {}
+  custom_data: {}
diff --git a/tests/test_data/facts_grounding_prompts/grounding_accuracy_span_level.yaml b/tests/test_data/facts_grounding_prompts/grounding_accuracy_span_level.yaml
@@ -0,0 +1,52 @@
+prompt:
+  template: |-
+    Your task is to check if a specific Span is accurate to the Evidence.
+    Generate 'Accurate' if the Span is accurate when verified according to the Evidence or when there is nothing to verify in the Span.
+    Generate 'Inaccurate' if the Span is inaccurate (contradicts the evidence), or cannot be verified.
+
+    **Query**:
+
+    {{user_request}}
+
+    **End of Query**
+
+    **Evidence**
+
+    {{context_document}}
+
+    **End of Evidence**
+
+    **Response**:
+
+    {{response}}
+
+    **End of Response**
+
+
+    You are currently verifying **Span {{ix+1}}** from the Response.
+    **Span {{ix+1}}**:
+
+    {{span}}
+
+    **End of Span {{ix+1}}**
+
+
+    Is Span {{ix+1}} accurate or inaccurate when verified according to the Evidence? Point to where in the evidence justifies your answer.
+  template_variables:
+    - user_request
+    - context_document
+    - response
+    - ix
+    - span
+  metadata:
+    description: "An evaluation prompt from the paper 'The FACTS Grounding Leaderboard: Benchmarking LLMs’ Ability to Ground
+      Responses to Long-Form Input' by Google DeepMind.\n    The prompt was copied from the evaluation_prompts.csv file from
+      Kaggle.\n    This specific prompt elicits a binary accurate/inaccurate classifier at the span level."
+    evaluation_method: span_level
+    tags:
+      - fact-checking
+    version: 1.0.0
+    author: Google DeepMind
+    source: https://www.kaggle.com/datasets/deepmind/FACTS-grounding-examples?resource=download&select=evaluation_prompts.csv
+  client_parameters: {}
+  custom_data: {}
diff --git a/tests/test_data/facts_grounding_prompts/grounding_nli_json.yaml b/tests/test_data/facts_grounding_prompts/grounding_nli_json.yaml
@@ -0,0 +1,77 @@
+prompt:
+  template: |-
+    You are a helpful and harmless AI assistant. You will be provided with a textual context and a model-generated response.
+    Your task is to analyze the response sentence by sentence and classify each sentence according to its relationship with the provided context.
+
+    **Instructions:**
+
+    1. **Decompose the response into individual sentences.**
+    2. **For each sentence, assign one of the following labels:**
+        * **`supported`**: The sentence is entailed by the given context.  Provide a supporting excerpt from the context. The supporting excerpt must *fully* entail the sentence. If you need to cite multiple supporting excerpts, simply concatenate them.
+        * **`unsupported`**: The sentence is not entailed by the given context. No excerpt is needed for this label.
+        * **`contradictory`**: The sentence is falsified by the given context. Provide a contradicting excerpt from the context.
+        * **`no_rad`**: The sentence does not require factual attribution (e.g., opinions, greetings, questions, disclaimers).  No excerpt is needed for this label.
+    3. **For each label, provide a short rationale explaining your decision.**  The rationale should be separate from the excerpt.
+    4. **Be very strict with your `supported` and `contradictory` decisions.** Unless you can find straightforward, indisputable evidence excerpts *in the context* that a sentence is `supported` or `contradictory`, consider it `unsupported`. You should not employ world knowledge unless it is truly trivial.
+
+    **Input Format:**
+
+    The input will consist of two parts, clearly separated:
+
+    * **Context:**  The textual context used to generate the response.
+    * **Response:** The model-generated response to be analyzed.
+
+    **Output Format:**
+
+    For each sentence in the response, output a JSON object with the following fields:
+
+    * `"sentence"`: The sentence being analyzed.
+    * `"label"`: One of `supported`, `unsupported`, `contradictory`, or `no_rad`.
+    * `"rationale"`: A brief explanation for the assigned label.
+    * `"excerpt"`:  A relevant excerpt from the context. Only required for `supported` and `contradictory` labels.
+
+    Output each JSON object on a new line.
+
+    **Example:**
+
+    **Input:**
+
+    ```
+    Context: Apples are red fruits. Bananas are yellow fruits.
+
+    Response: Apples are red. Bananas are green. Bananas are cheaper than apples. Enjoy your fruit!
+    ```
+
+    **Output:**
+
+    {"sentence": "Apples are red.", "label": "supported", "rationale": "The context explicitly states that apples are red.", "excerpt": "Apples are red fruits."}
+    {"sentence": "Bananas are green.", "label": "contradictory", "rationale": "The context states that bananas are yellow, not green.", "excerpt": "Bananas are yellow fruits."}
+    {"sentence": "Bananas are cheaper than apples.", "label": "unsupported", "rationale": "The context does not mention the price of bananas or apples.", "excerpt": null}
+    {"sentence": "Enjoy your fruit!", "label": "no_rad", "rationale": "This is a general expression and does not require factual attribution.", "excerpt": null}
+
+    **Now, please analyze the following context and response:**
+
+    **User Query:**
+    {{user_request}}
+
+    **Context:**
+    {{context_document}}
+
+    **Response:**
+    {{response}}
+  template_variables:
+    - user_request
+    - context_document
+    - response
+  metadata:
+    description: "An evaluation prompt from the paper 'The FACTS Grounding Leaderboard: Benchmarking LLMs’ Ability to Ground
+      Responses to Long-Form Input' by Google DeepMind.\n    The prompt was copied from the evaluation_prompts.csv file from
+      Kaggle.\n    This specific prompt elicits an NLI-style sentence-by-sentence checker outputting JSON for each sentence."
+    evaluation_method: json
+    tags:
+      - fact-checking
+    version: 1.0.0
+    author: Google DeepMind
+    source: https://www.kaggle.com/datasets/deepmind/FACTS-grounding-examples?resource=download&select=evaluation_prompts.csv
+  client_parameters: {}
+  custom_data: {}
diff --git a/tests/test_data/facts_grounding_prompts/grounding_nli_json_alt.yaml b/tests/test_data/facts_grounding_prompts/grounding_nli_json_alt.yaml
@@ -0,0 +1,76 @@
+prompt:
+  template: |-
+    You are a helpful and harmless AI assistant. You will be provided with a textual context and a model-generated response.
+    Your task is to analyze the response sentence by sentence and classify each sentence according to its relationship with the provided context.
+
+    **Instructions:**
+
+    1. **Decompose the response into individual sentences.**
+    2. **For each sentence, assign one of the following labels:**
+        * **`supported`**: The sentence is entailed by the given context.  Provide a supporting excerpt from the context.
+        * **`unsupported`**: The sentence is not entailed by the given context. Provide an excerpt that is close but does not fully support the sentence.
+        * **`contradictory`**: The sentence is falsified by the given context. Provide a contradicting excerpt from the context.
+        * **`no_rad`**: The sentence does not require factual attribution (e.g., opinions, greetings, questions, disclaimers).  No excerpt is needed for this label.
+
+    3. **For each label, provide a short rationale explaining your decision.**  The rationale should be separate from the excerpt.
+
+    **Input Format:**
+
+    The input will consist of two parts, clearly separated:
+
+    * **Context:**  The textual context used to generate the response.
+    * **Response:** The model-generated response to be analyzed.
+
+    **Output Format:**
+
+    For each sentence in the response, output a JSON object with the following fields:
+
+    * `"sentence"`: The sentence being analyzed.
+    * `"label"`: One of `supported`, `unsupported`, `contradictory`, or `no_rad`.
+    * `"rationale"`: A brief explanation for the assigned label.
+    * `"excerpt"`:  A relevant excerpt from the context. Only required for `supported`, `unsupported`, and `contradictory` labels.
+
+    Output each JSON object on a new line.
+
+    **Example:**
+
+    **Input:**
+
+    ```
+    Context: Apples are red fruits. Bananas are yellow fruits.
+
+    Response: Apples are red. Bananas are green.  Enjoy your fruit!
+    ```
+
+    **Output:**
+
+    {"sentence": "Apples are red.", "label": "supported", "rationale": "The context explicitly states that apples are red.", "excerpt": "Apples are red fruits."}
+    {"sentence": "Bananas are green.", "label": "contradictory", "rationale": "The context states that bananas are yellow, not green.", "excerpt": "Bananas are yellow fruits."}
+    {"sentence": "Enjoy your fruit!", "label": "no_rad", "rationale": "This is a general expression and does not require factual attribution.", "excerpt": null}
+
+    **Now, please analyze the following context and response:**
+
+    **User Query:**
+    {{user_request}}
+
+    **Context:**
+    {{context_document}}
+
+    **Response:**
+    {{response}}
+  template_variables:
+    - user_request
+    - context_document
+    - response
+  metadata:
+    description: "An evaluation prompt from the paper 'The FACTS Grounding Leaderboard: Benchmarking LLMs’ Ability to Ground
+      Responses to Long-Form Input' by Google DeepMind.\n    The prompt was copied from the evaluation_prompts.csv file from
+      Kaggle.\n    This specific prompt elicits an NLI-style sentence-by-sentence checker outputting JSON for each sentence. This alternative variant also asks for an excerpt for unsupported sentences."
+    evaluation_method: json_alt
+    tags:
+      - fact-checking
+    version: 1.0.0
+    author: Google DeepMind
+    source: https://www.kaggle.com/datasets/deepmind/FACTS-grounding-examples?resource=download&select=evaluation_prompts.csv
+  client_parameters: {}
+  custom_data: {}
diff --git a/tests/test_data/facts_grounding_prompts/grounding_nli_json_double_check.yaml b/tests/test_data/facts_grounding_prompts/grounding_nli_json_double_check.yaml
@@ -0,0 +1,23 @@
+prompt:
+  template: |-
+    Your task is to verify whether a given sentence is entailed by a given context or not. Answer only in YES or NO without any additional text. Do not try to avoid answering, or apologize, or give any answer that isn't simply YES or NO.
+
+    **Sentence**
+    {{json_dict["sentence"]}}
+
+    **Context**
+    {{json_dict["excerpt"]}}
+  template_variables:
+    - json_dict
+  metadata:
+    description: "An evaluation prompt from the paper 'The FACTS Grounding Leaderboard: Benchmarking LLMs’ Ability to Ground
+      Responses to Long-Form Input' by Google DeepMind.\n    The prompt was copied from the evaluation_prompts.csv file from
+      Kaggle.\n    This specific prompt elicits a binary entailment/non-entailment classifier. It requires a dict with 'sentence' and 'excerpt' keys as input."
+    evaluation_method: json_with_double_check
+    tags:
+      - fact-checking
+    version: 1.0.0
+    author: Google DeepMind
+    source: https://www.kaggle.com/datasets/deepmind/FACTS-grounding-examples?resource=download&select=evaluation_prompts.csv
+  client_parameters: {}
+  custom_data: {}