ragas: pass rubric in via RubricScore.SingleTurnPrompt.instruction #211

Open
wants to merge 1 commit into base: main
23 changes: 21 additions & 2 deletions src/instructlab/eval/ragas.py
@@ -12,8 +12,8 @@
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
 from ragas.metrics import Metric
 from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
-    DEFAULT_WITH_REFERENCE_RUBRICS,
     RubricsScore,
+    SingleTurnPrompt,
 )

 # Local
@@ -22,6 +22,23 @@

 logger = setup_logger(__name__)

+RUBRIC = """You are an evaluation system tasked with assessing the answer quality of an AI-generated response in relation to the posed question and reference answer. Assess whether the response is correct, accurate, and factual based on the reference answer.
+To evaluate the factuality of the answer, look at the reference answer and compare the model answer to it.
+Evaluate the answer_quality as:
+- Score 1: The response is completely incorrect, inaccurate, and/or not factual.
+- Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
+- Score 3: The response is somewhat correct, accurate, and/or factual.
+- Score 4: The response is mostly correct, accurate, and factual.
+- Score 5: The response is completely correct, accurate, and factual.
+Here is the question: \n ------- \n {user_input} \n -------
+Here is the model answer: \n ------- \n {response} \n -------
+Here is the reference answer (it may be very short and lack details, or indirect, long, and extractive): \n ------- \n {reference} \n ------- \n
+Assess the quality of the model answer with respect to the reference answer, but do not penalize the model answer for adding details or for giving a direct answer to the user's question.
+Approach your evaluation in a step-by-step manner.
+First, list out the key facts covered in the reference answer and check how many are covered by the model answer.
+If the question or reference answer is about steps, check whether the steps and their order in the model answer match the reference answer.
+Provide your response as a JSON object with two keys: 'reasoning' and 'answer_quality'."""
+

 class Sample(TypedDict):
     """
@@ -257,8 +274,10 @@ def _generate_answers_from_model(
     @staticmethod
     def _get_metrics() -> List[Metric]:
         # default set of metrics
+        st_prompt = SingleTurnPrompt()
+        st_prompt.instruction = RUBRIC
         return [
             RubricsScore(
-                rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
+                single_turn_prompt=st_prompt,
             )
         ]
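
For reviewers who want to exercise the change, here is a minimal, hypothetical usage sketch (not part of this PR). It builds the metric the same way the new `_get_metrics()` does and passes it to ragas's `evaluate()`, which the module already imports. The `evaluator_llm` judge, the `EvaluationDataset.from_list` construction, and the sample field names (`user_input`, `response`, `reference`, chosen to match the placeholders in `RUBRIC`) are assumptions, not code from this repository.

# Hypothetical usage sketch (not part of this PR): build the rubric-backed
# metric the same way _get_metrics() does, then score a one-sample dataset.
from ragas.evaluation import EvaluationDataset, evaluate
from ragas.metrics._domain_specific_rubrics import RubricsScore, SingleTurnPrompt

from instructlab.eval.ragas import RUBRIC  # the constant added in this PR

st_prompt = SingleTurnPrompt()
st_prompt.instruction = RUBRIC

metric = RubricsScore(single_turn_prompt=st_prompt)

# Field names are chosen to match the placeholders referenced in RUBRIC.
dataset = EvaluationDataset.from_list(
    [
        {
            "user_input": "What is the capital of France?",
            "response": "Paris is the capital of France.",
            "reference": "Paris",
        }
    ]
)

# evaluator_llm is assumed to be a ragas-compatible judge model configured
# elsewhere; a judge-based metric like RubricsScore cannot score without one.
result = evaluate(dataset=dataset, metrics=[metric], llm=evaluator_llm)
print(result)

Because RubricsScore is judge-based, the score depends on the model behind `evaluator_llm`; the instruction in `st_prompt` constrains the grading criteria and output format but does not remove that dependency.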