Add two criteria based direct llm judges (#1527)
* Add criteria based direct judges
* Add criteria to catalog
* Move judges under rag.response_generation
* Fix typo in prompt

Signed-off-by: lilacheden <[email protected]>
Co-authored-by: Yoav Katz <[email protected]>
1 parent f684a22 · commit bcc7b6a
Showing 5 changed files with 315 additions and 0 deletions.
prepare/metrics/llm_as_judge/direct/llama_3_3_70b_instruct_adherence_completeness.py (129 additions, 0 deletions)
from unitxt import add_to_catalog
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import (
    CriteriaWithOptions,
)

# Maps each option name to the numeric score the metric reports.
option_map = {
    "Excellent": 1.0,
    "Good": 0.75,
    "mediocre": 0.5,
    "Bad": 0.25,
    "Very Bad": 0,
}

# First, define the judgment criteria
adherence_criteria = CriteriaWithOptions.from_obj(
    {
        "name": "adherence_with_format",
        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
        "options": [
            {
                "name": "Excellent",
                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations.",
            },
            {
                "name": "Good",
                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability.",
            },
            {
                "name": "mediocre",
                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present.",
            },
            {
                "name": "Bad",
                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence.",
            },
            {
                "name": "Very Bad",
                "description": "The response fails to align with the requested structure, style, or format.",
            },
        ],
        "option_map": option_map,
    }
)
add_to_catalog(
    adherence_criteria,
    f"metrics.llm_as_judge.direct.criterias.{adherence_criteria.name}",
    overwrite=True,
)

completeness_criteria = CriteriaWithOptions.from_obj(
    {
        "name": "answer_completeness",
        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The "
        "response might use different phrasing or wording from the reference answer.",
        "options": [
            {
                "name": "Excellent",
                "description": "The response addresses all aspects of the reference answer.",
            },
            {
                "name": "Good",
                "description": "The response addresses most aspects of the reference answer, with minor omissions.",
            },
            {
                "name": "mediocre",
                "description": "The response covers the essential aspects of the reference answer but has notable omissions.",
            },
            {
                "name": "Bad",
                "description": "The response covers only a few aspects of the reference answer, with significant omissions.",
            },
            {
                "name": "Very Bad",
                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted.",
            },
        ],
        "option_map": option_map,
    }
)
add_to_catalog(
    completeness_criteria,
    f"metrics.llm_as_judge.direct.criterias.{completeness_criteria.name}",
    overwrite=True,
)


# Now, define the judge metric that uses the adherence criteria
adherence_metric = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference engine
        model="llama-3-3-70b-instruct", max_tokens=1024
    ),
    criteria=adherence_criteria,
    # The fields from the generation task to be presented to the judge. These fields
    # must be present in the generation task so they can be embedded here.
    context_fields={
        "question": "question",
        "instructions": "metadata/template/instruction",
    },
    criteria_field="criteria",
    generate_summaries=False,
    check_positional_bias=False,
)
add_to_catalog(
    adherence_metric,
    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge",
    overwrite=True,
)

# Now, define the judge metric that uses the completeness criteria
completeness_metric = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference engine
        model="llama-3-3-70b-instruct", max_tokens=1024
    ),
    criteria=completeness_criteria,
    # The fields from the generation task to be presented to the judge. These fields
    # must be present in the generation task so they can be embedded here.
    context_fields={"question": "question", "reference_answers": "reference_answers"},
    criteria_field="criteria",
    generate_summaries=False,
    check_positional_bias=False,
)

add_to_catalog(
    completeness_metric,
    "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge",
    overwrite=True,
)
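Once cataloged, these judges behave like any other unitxt metric. A minimal usage sketch follows; the card and template names here are assumptions, so substitute any RAG response-generation card that supplies the "question" and "reference_answers" fields the judges' context_fields expect:

from unitxt import evaluate, load_dataset

# Assumption: this card/template pair exists in your catalog version.
dataset = load_dataset(
    card="cards.rag.response_generation.clapnq",
    template="templates.rag.response_generation.please_respond",
    metrics=[
        "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge"
    ],
    loader_limit=10,
)["test"]

predictions = ["..."] * len(dataset)  # replace with your model's outputs
results = evaluate(predictions, dataset)
print(results[0]["score"]["global"])  # judge scores aggregated over the instances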
src/unitxt/catalog/metrics/llm_as_judge/direct/criterias/adherence_with_format.json (39 additions, 0 deletions)
{
    "__type__": "criteria_with_options",
    "name": "adherence_with_format",
    "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
    "options": [
        {
            "__type__": "criteria_option",
            "name": "Excellent",
            "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
        },
        {
            "__type__": "criteria_option",
            "name": "Good",
            "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
        },
        {
            "__type__": "criteria_option",
            "name": "mediocre",
            "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
        },
        {
            "__type__": "criteria_option",
            "name": "Bad",
            "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
        },
        {
            "__type__": "criteria_option",
            "name": "Very Bad",
            "description": "The response fails to align with the requested structure, style, or format."
        }
    ],
    "option_map": {
        "Excellent": 1.0,
        "Good": 0.75,
        "mediocre": 0.5,
        "Bad": 0.25,
        "Very Bad": 0
    }
}
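Because the criteria now live in the catalog, they can also be fetched programmatically and reused when assembling other judges. A small sketch using unitxt's artifact fetching (fetch_artifact resolves a catalog name and returns the artifact together with the catalog it came from):

from unitxt.artifact import fetch_artifact

# Resolve the cataloged criteria by name; the second return value is the
# catalog it was resolved from and is not needed here.
criteria, _ = fetch_artifact(
    "metrics.llm_as_judge.direct.criterias.adherence_with_format"
)
print(criteria.name)  # "adherence_with_format"
print([option.name for option in criteria.options])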
src/unitxt/catalog/metrics/llm_as_judge/direct/criterias/answer_completeness.json (39 additions, 0 deletions)
{
    "__type__": "criteria_with_options",
    "name": "answer_completeness",
    "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
    "options": [
        {
            "__type__": "criteria_option",
            "name": "Excellent",
            "description": "The response addresses all aspects of the reference answer."
        },
        {
            "__type__": "criteria_option",
            "name": "Good",
            "description": "The response addresses most aspects of the reference answer, with minor omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "mediocre",
            "description": "The response covers the essential aspects of the reference answer but has notable omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "Bad",
            "description": "The response covers only a few aspects of the reference answer, with significant omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "Very Bad",
            "description": "The response fails to address the reference answer meaningfully, with most aspects omitted."
        }
    ],
    "option_map": {
        "Excellent": 1.0,
        "Good": 0.75,
        "mediocre": 0.5,
        "Bad": 0.25,
        "Very Bad": 0
    }
}
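In both criteria files, option_map is what turns the judge's categorical verdict into the numeric score the metric ultimately reports. A toy illustration of the mapping:

# The judge picks one option name; option_map converts it into a score.
option_map = {"Excellent": 1.0, "Good": 0.75, "mediocre": 0.5, "Bad": 0.25, "Very Bad": 0}

verdict = "Good"  # e.g., the option the judge selected
assert option_map[verdict] == 0.75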
...g/metrics/rag/response_generation/adherence_with_format/llama_3_3_70b_instruct_judge.json (54 additions, 0 deletions)
{
    "__type__": "llm_judge_direct",
    "inference_engine": {
        "__type__": "cross_provider_inference_engine",
        "model": "llama-3-3-70b-instruct",
        "max_tokens": 1024
    },
    "criteria": {
        "__type__": "criteria_with_options",
        "name": "adherence_with_format",
        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
        "options": [
            {
                "__type__": "criteria_option",
                "name": "Excellent",
                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
            },
            {
                "__type__": "criteria_option",
                "name": "Good",
                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
            },
            {
                "__type__": "criteria_option",
                "name": "mediocre",
                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
            },
            {
                "__type__": "criteria_option",
                "name": "Bad",
                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
            },
            {
                "__type__": "criteria_option",
                "name": "Very Bad",
                "description": "The response fails to align with the requested structure, style, or format."
            }
        ],
        "option_map": {
            "Excellent": 1.0,
            "Good": 0.75,
            "mediocre": 0.5,
            "Bad": 0.25,
            "Very Bad": 0
        }
    },
    "context_fields": {
        "question": "question",
        "instructions": "metadata/template/instruction"
    },
    "criteria_field": "criteria",
    "generate_summaries": false,
    "check_positional_bias": false
}
...log/metrics/rag/response_generation/answer_completeness/llama_3_3_70b_instruct_judge.json (54 additions, 0 deletions)
{
    "__type__": "llm_judge_direct",
    "inference_engine": {
        "__type__": "cross_provider_inference_engine",
        "model": "llama-3-3-70b-instruct",
        "max_tokens": 1024
    },
    "criteria": {
        "__type__": "criteria_with_options",
        "name": "answer_completeness",
        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
        "options": [
            {
                "__type__": "criteria_option",
                "name": "Excellent",
                "description": "The response addresses all aspects of the reference answer."
            },
            {
                "__type__": "criteria_option",
                "name": "Good",
                "description": "The response addresses most aspects of the reference answer, with minor omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "mediocre",
                "description": "The response covers the essential aspects of the reference answer but has notable omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "Bad",
                "description": "The response covers only a few aspects of the reference answer, with significant omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "Very Bad",
                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted."
            }
        ],
        "option_map": {
            "Excellent": 1.0,
            "Good": 0.75,
            "mediocre": 0.5,
            "Bad": 0.25,
            "Very Bad": 0
        }
    },
    "context_fields": {
        "question": "question",
        "reference_answers": "reference_answers"
    },
    "criteria_field": "criteria",
    "generate_summaries": false,
    "check_positional_bias": false
}