Add two criteria-based direct LLM judges (#1527)
* Add criteria-based direct judges
Signed-off-by: lilacheden <[email protected]>

* Add criteria to catalog
Signed-off-by: lilacheden <[email protected]>

* Move judges under rag.response_generation
Signed-off-by: lilacheden <[email protected]>

* Fix typo in prompt
Signed-off-by: lilacheden <[email protected]>

---------

Co-authored-by: Yoav Katz <[email protected]>
lilacheden and yoavkatz authored Jan 21, 2025
1 parent f684a22 commit bcc7b6a
Showing 5 changed files with 315 additions and 0 deletions.
@@ -0,0 +1,129 @@
from unitxt import add_to_catalog
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import (
    CriteriaWithOptions,
)

option_map = {
    "Excellent": 1.0,
    "Good": 0.75,
    "mediocre": 0.5,
    "Bad": 0.25,
    "Very Bad": 0,
}

# First, define a judgment criterion
adherence_criteria = CriteriaWithOptions.from_obj(
    {
        "name": "adherence_with_format",
        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
        "options": [
            {
                "name": "Excellent",
                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations.",
            },
            {
                "name": "Good",
                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability.",
            },
            {
                "name": "mediocre",
                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present.",
            },
            {
                "name": "Bad",
                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence.",
            },
            {
                "name": "Very Bad",
                "description": "The response fails to align with the requested structure, style, or format.",
            },
        ],
        "option_map": option_map,
    }
)
add_to_catalog(
    adherence_criteria,
    f"metrics.llm_as_judge.direct.criterias.{adherence_criteria.name}",
    overwrite=True,
)

completeness_criteria = CriteriaWithOptions.from_obj(
    {
        "name": "answer_completeness",
        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The "
        "response might use different phrasing or wording from the reference answer.",
        "options": [
            {
                "name": "Excellent",
                "description": "The response addresses all aspects of the reference answer.",
            },
            {
                "name": "Good",
                "description": "The response addresses most aspects of the reference answer, with minor omissions.",
            },
            {
                "name": "mediocre",
                "description": "The response covers the essential aspects of the reference answer but has notable omissions.",
            },
            {
                "name": "Bad",
                "description": "The response covers only a few aspects of the reference answer, with significant omissions.",
            },
            {
                "name": "Very Bad",
                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted.",
            },
        ],
        "option_map": option_map,
    }
)
add_to_catalog(
    completeness_criteria,
    f"metrics.llm_as_judge.direct.criterias.{completeness_criteria.name}",
    overwrite=True,
)


# Now, define the judge metric that uses the adherence criterion
adherence_metric = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
        model="llama-3-3-70b-instruct", max_tokens=1024
    ),
    criteria=adherence_criteria,
    # The fields of the generation task that are presented to the judge. These fields
    # must exist in the generation task so they can be embedded here.
    context_fields={
        "question": "question",
        "instructions": "metadata/template/instruction",
    },
    criteria_field="criteria",
    generate_summaries=False,
    check_positional_bias=False,
)
add_to_catalog(
    adherence_metric,
    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge",
    overwrite=True,
)

# Similarly, define the judge metric that uses the completeness criterion
completeness_metric = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
        model="llama-3-3-70b-instruct", max_tokens=1024
    ),
    criteria=completeness_criteria,
    # The fields of the generation task that are presented to the judge. These fields
    # must exist in the generation task so they can be embedded here.
    context_fields={"question": "question", "reference_answers": "reference_answers"},
    criteria_field="criteria",
    generate_summaries=False,
    check_positional_bias=False,
)

add_to_catalog(
    completeness_metric,
    "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge",
    overwrite=True,
)
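
For reference, a minimal usage sketch (not part of this commit) of the completeness judge registered above. It assumes a task that exposes the "question" and "reference_answers" fields referenced by the judge's context_fields, such as tasks.rag.response_generation; the sample data and prediction are illustrative only.

from unitxt import create_dataset, evaluate

# Toy data; field names follow tasks.rag.response_generation.
data = [
    {
        "contexts": ["Paris is the capital and largest city of France."],
        "question": "What is the capital of France?",
        "reference_answers": ["Paris"],
    }
]

dataset = create_dataset(
    task="tasks.rag.response_generation",
    test_set=data,
    split="test",
    metrics=[
        "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge"
    ],
)

# One prediction per instance, e.g. from the system under evaluation.
predictions = ["The capital of France is Paris."]
results = evaluate(predictions=predictions, data=dataset)

# The exact results layout varies across unitxt versions; recent versions
# expose the aggregated scores via results.global_scores.
print(results.global_scores)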
@@ -0,0 +1,39 @@
{
    "__type__": "criteria_with_options",
    "name": "adherence_with_format",
    "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
    "options": [
        {
            "__type__": "criteria_option",
            "name": "Excellent",
            "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
        },
        {
            "__type__": "criteria_option",
            "name": "Good",
            "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
        },
        {
            "__type__": "criteria_option",
            "name": "mediocre",
            "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
        },
        {
            "__type__": "criteria_option",
            "name": "Bad",
            "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
        },
        {
            "__type__": "criteria_option",
            "name": "Very Bad",
            "description": "The response fails to align with the requested structure, style, or format."
        }
    ],
    "option_map": {
        "Excellent": 1.0,
        "Good": 0.75,
        "mediocre": 0.5,
        "Bad": 0.25,
        "Very Bad": 0
    }
}
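
The option_map is what turns the judge's categorical verdict into the numeric score the metric reports. A minimal sketch of that mapping, for illustration only (the actual conversion happens inside LLMJudgeDirect):

option_map = {
    "Excellent": 1.0,
    "Good": 0.75,
    "mediocre": 0.5,
    "Bad": 0.25,
    "Very Bad": 0,
}

def verdict_to_score(verdict: str) -> float:
    # An unknown verdict raises KeyError rather than silently scoring 0.
    return option_map[verdict]

assert verdict_to_score("Good") == 0.75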
@@ -0,0 +1,39 @@
{
    "__type__": "criteria_with_options",
    "name": "answer_completeness",
    "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
    "options": [
        {
            "__type__": "criteria_option",
            "name": "Excellent",
            "description": "The response addresses all aspects of the reference answer."
        },
        {
            "__type__": "criteria_option",
            "name": "Good",
            "description": "The response addresses most aspects of the reference answer, with minor omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "mediocre",
            "description": "The response covers the essential aspects of the reference answer but has notable omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "Bad",
            "description": "The response covers only a few aspects of the reference answer, with significant omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "Very Bad",
            "description": "The response fails to address the reference answer meaningfully, with most aspects omitted."
        }
    ],
    "option_map": {
        "Excellent": 1.0,
        "Good": 0.75,
        "mediocre": 0.5,
        "Bad": 0.25,
        "Very Bad": 0
    }
}
@@ -0,0 +1,54 @@
{
    "__type__": "llm_judge_direct",
    "inference_engine": {
        "__type__": "cross_provider_inference_engine",
        "model": "llama-3-3-70b-instruct",
        "max_tokens": 1024
    },
    "criteria": {
        "__type__": "criteria_with_options",
        "name": "adherence_with_format",
        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
        "options": [
            {
                "__type__": "criteria_option",
                "name": "Excellent",
                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
            },
            {
                "__type__": "criteria_option",
                "name": "Good",
                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
            },
            {
                "__type__": "criteria_option",
                "name": "mediocre",
                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
            },
            {
                "__type__": "criteria_option",
                "name": "Bad",
                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
            },
            {
                "__type__": "criteria_option",
                "name": "Very Bad",
                "description": "The response fails to align with the requested structure, style, or format."
            }
        ],
        "option_map": {
            "Excellent": 1.0,
            "Good": 0.75,
            "mediocre": 0.5,
            "Bad": 0.25,
            "Very Bad": 0
        }
    },
    "context_fields": {
        "question": "question",
        "instructions": "metadata/template/instruction"
    },
    "criteria_field": "criteria",
    "generate_summaries": false,
    "check_positional_bias": false
}
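
Note the context_fields value "metadata/template/instruction": it reads as a slash-separated path into nested instance fields, so the judge is shown the instruction that the template injected. A hedged sketch of loading the registered judge back from the catalog to inspect this configuration, using unitxt's fetch_artifact (which resolves a catalog name to the instantiated object):

from unitxt.artifact import fetch_artifact

judge, _ = fetch_artifact(
    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge"
)
print(type(judge).__name__)  # LLMJudgeDirect
print(judge.context_fields)  # {'question': 'question', 'instructions': 'metadata/template/instruction'}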
@@ -0,0 +1,54 @@
{
    "__type__": "llm_judge_direct",
    "inference_engine": {
        "__type__": "cross_provider_inference_engine",
        "model": "llama-3-3-70b-instruct",
        "max_tokens": 1024
    },
    "criteria": {
        "__type__": "criteria_with_options",
        "name": "answer_completeness",
        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
        "options": [
            {
                "__type__": "criteria_option",
                "name": "Excellent",
                "description": "The response addresses all aspects of the reference answer."
            },
            {
                "__type__": "criteria_option",
                "name": "Good",
                "description": "The response addresses most aspects of the reference answer, with minor omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "mediocre",
                "description": "The response covers the essential aspects of the reference answer but has notable omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "Bad",
                "description": "The response covers only a few aspects of the reference answer, with significant omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "Very Bad",
                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted."
            }
        ],
        "option_map": {
            "Excellent": 1.0,
            "Good": 0.75,
            "mediocre": 0.5,
            "Bad": 0.25,
            "Very Bad": 0
        }
    },
    "context_fields": {
        "question": "question",
        "reference_answers": "reference_answers"
    },
    "criteria_field": "criteria",
    "generate_summaries": false,
    "check_positional_bias": false
}
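
Because both judges wrap a CrossProviderInferenceEngine, the catalog criteria can also be paired with a different backing model. A hedged sketch under that assumption (the model id below is a placeholder, not part of this commit; use any id the engine supports):

from unitxt.artifact import fetch_artifact
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect

# Reuse the completeness criterion added to the catalog by this commit.
criteria, _ = fetch_artifact("metrics.llm_as_judge.direct.criterias.answer_completeness")

custom_judge = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(
        model="gpt-4o", max_tokens=1024  # placeholder model id
    ),
    criteria=criteria,
    context_fields={"question": "question", "reference_answers": "reference_answers"},
)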
