Add two criteria-based direct LLM judges #1527

Merged: 7 commits, Jan 21, 2025
@@ -0,0 +1,129 @@
from unitxt import add_to_catalog
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import (
    CriteriaWithOptions,
)

option_map = {
    "Excellent": 1.0,
    "Good": 0.75,
    "mediocre": 0.5,
    "Bad": 0.25,
    "Very Bad": 0,
}

# First, describe a judgement criterion
adherence_criteria = CriteriaWithOptions.from_obj(
    {
        "name": "adherence_with_format",
        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
        "options": [
            {
                "name": "Excellent",
                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations.",
            },
            {
                "name": "Good",
                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability.",
            },
            {
                "name": "mediocre",
                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present.",
            },
            {
                "name": "Bad",
                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence.",
            },
            {
                "name": "Very Bad",
                "description": "The response fails to align with the requested structure, style, or format.",
            },
        ],
        "option_map": option_map,
    }
)
add_to_catalog(
    adherence_criteria,
    f"metrics.llm_as_judge.direct.criterias.{adherence_criteria.name}",
    overwrite=True,
)
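For context (not part of the diff): the option_map is what converts the judge's categorical verdict into the numeric score the metric reports. A minimal illustration in plain Python, reusing the dictionary defined above:

    # Hypothetical illustration: translate a judge's verdict into its score.
    verdict = "Good"             # the option the judge selected
    score = option_map[verdict]  # -> 0.75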

Member: So they can be reused.

Member Author: @yoavkatz - ok, added the criteria to the catalog

completeness_criteria = CriteriaWithOptions.from_obj(
    {
        "name": "answer_completeness",
        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The "
        "response might use different phrasing or wording from the reference answer.",
        "options": [
            {
                "name": "Excellent",
                "description": "The response addresses all aspects of the reference answer.",
            },
            {
                "name": "Good",
                "description": "The response addresses most aspects of the reference answer, with minor omissions.",
            },
            {
                "name": "mediocre",
                "description": "The response covers the essential aspects of the reference answer but has notable omissions.",
            },
            {
                "name": "Bad",
                "description": "The response covers only a few aspects of the reference answer, with significant omissions.",
            },
            {
                "name": "Very Bad",
                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted.",
            },
        ],
        "option_map": option_map,
    }
)
add_to_catalog(
    completeness_criteria,
    f"metrics.llm_as_judge.direct.criterias.{completeness_criteria.name}",
    overwrite=True,
)
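Once registered, a criterion can be pulled back out of the catalog by name, which is what makes it reusable across judges. A hedged sketch, assuming unitxt's fetch_artifact helper behaves as in other catalog examples:

    from unitxt.artifact import fetch_artifact

    # Assumption: fetch_artifact returns (artifact, catalog); adjust to the installed API.
    criterion, _ = fetch_artifact("metrics.llm_as_judge.direct.criterias.answer_completeness")
    print(criterion.name)  # -> "answer_completeness"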


# Now, define the judge metric using the criterion
adherence_metric = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
        model="llama-3-3-70b-instruct", max_tokens=1024
    ),
    criteria=adherence_criteria,
    # The fields of the generation task that are presented to the judge. These fields
    # must be present in the generation task so they can be embedded here.
    context_fields={
        "question": "question",
        "instructions": "metadata/template/instruction",
    },
    criteria_field="criteria",
    generate_summaries=False,
    check_positional_bias=False,
)
add_to_catalog(
    adherence_metric,
    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge",
    overwrite=True,
)
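The context_fields mapping pulls values out of each task instance and into the judge's prompt; "metadata/template/instruction" is a nested path into the instance. A hypothetical instance shape (field values illustrative, not from the PR) that would satisfy the adherence judge:

    # Hypothetical instance the adherence judge could consume:
    instance = {
        "question": "List three benefits of unit tests, as bullet points.",
        "metadata": {"template": {"instruction": "Answer using bullet points."}},
    }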

# Now, define the judge metric using the criterion
completeness_metric = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
        model="llama-3-3-70b-instruct", max_tokens=1024
    ),
    criteria=completeness_criteria,
    # The fields of the generation task that are presented to the judge. These fields
    # must be present in the generation task so they can be embedded here.
    context_fields={"question": "question", "reference_answers": "reference_answers"},
    criteria_field="criteria",
    generate_summaries=False,
    check_positional_bias=False,
)

add_to_catalog(
    completeness_metric,
    "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge",
    overwrite=True,
)
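A hedged end-to-end sketch of using the registered completeness judge. The create_dataset/evaluate calls follow the patterns in unitxt's documentation, and the task name and instance fields are assumptions that may need adjusting to the installed version:

    from unitxt import create_dataset, evaluate

    data = [
        {
            "question": "What is unit testing?",
            "contexts": ["Unit testing checks individual components in isolation."],
            "reference_answers": ["Testing individual components in isolation."],
        }
    ]
    predictions = ["Unit testing verifies each component on its own."]

    # Assumption: tasks.rag.response_generation carries question/contexts/reference_answers.
    dataset = create_dataset(
        task="tasks.rag.response_generation",
        test_set=data,
        metrics=["metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge"],
        split="test",
    )
    results = evaluate(predictions=predictions, data=dataset)
    print(results.global_scores)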
@@ -0,0 +1,39 @@
{
    "__type__": "criteria_with_options",
    "name": "adherence_with_format",
    "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
    "options": [
        {
            "__type__": "criteria_option",
            "name": "Excellent",
            "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
        },
        {
            "__type__": "criteria_option",
            "name": "Good",
            "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
        },
        {
            "__type__": "criteria_option",
            "name": "mediocre",
            "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
        },
        {
            "__type__": "criteria_option",
            "name": "Bad",
            "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
        },
        {
            "__type__": "criteria_option",
            "name": "Very Bad",
            "description": "The response fails to align with the requested structure, style, or format."
        }
    ],
    "option_map": {
        "Excellent": 1.0,
        "Good": 0.75,
        "mediocre": 0.5,
        "Bad": 0.25,
        "Very Bad": 0
    }
}
@@ -0,0 +1,39 @@
{
    "__type__": "criteria_with_options",
    "name": "answer_completeness",
    "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
    "options": [
        {
            "__type__": "criteria_option",
            "name": "Excellent",
            "description": "The response addresses all aspects of the reference answer."
        },
        {
            "__type__": "criteria_option",
            "name": "Good",
            "description": "The response addresses most aspects of the reference answer, with minor omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "mediocre",
            "description": "The response covers the essential aspects of the reference answer but has notable omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "Bad",
            "description": "The response covers only a few aspects of the reference answer, with significant omissions."
        },
        {
            "__type__": "criteria_option",
            "name": "Very Bad",
            "description": "The response fails to address the reference answer meaningfully, with most aspects omitted."
        }
    ],
    "option_map": {
        "Excellent": 1.0,
        "Good": 0.75,
        "mediocre": 0.5,
        "Bad": 0.25,
        "Very Bad": 0
    }
}
@@ -0,0 +1,54 @@
{
    "__type__": "llm_judge_direct",
    "inference_engine": {
        "__type__": "cross_provider_inference_engine",
        "model": "llama-3-3-70b-instruct",
        "max_tokens": 1024
    },
    "criteria": {
        "__type__": "criteria_with_options",
        "name": "adherence_with_format",
        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
        "options": [
            {
                "__type__": "criteria_option",
                "name": "Excellent",
                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
            },
            {
                "__type__": "criteria_option",
                "name": "Good",
                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
            },
            {
                "__type__": "criteria_option",
                "name": "mediocre",
                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
            },
            {
                "__type__": "criteria_option",
                "name": "Bad",
                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
            },
            {
                "__type__": "criteria_option",
                "name": "Very Bad",
                "description": "The response fails to align with the requested structure, style, or format."
            }
        ],
        "option_map": {
            "Excellent": 1.0,
            "Good": 0.75,
            "mediocre": 0.5,
            "Bad": 0.25,
            "Very Bad": 0
        }
    },
    "context_fields": {
        "question": "question",
        "instructions": "metadata/template/instruction"
    },
    "criteria_field": "criteria",
    "generate_summaries": false,
    "check_positional_bias": false
}
@@ -0,0 +1,54 @@
{
    "__type__": "llm_judge_direct",
    "inference_engine": {
        "__type__": "cross_provider_inference_engine",
        "model": "llama-3-3-70b-instruct",
        "max_tokens": 1024
    },
    "criteria": {
        "__type__": "criteria_with_options",
        "name": "answer_completeness",
        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
        "options": [
            {
                "__type__": "criteria_option",
                "name": "Excellent",
                "description": "The response addresses all aspects of the reference answer."
            },
            {
                "__type__": "criteria_option",
                "name": "Good",
                "description": "The response addresses most aspects of the reference answer, with minor omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "mediocre",
                "description": "The response covers the essential aspects of the reference answer but has notable omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "Bad",
                "description": "The response covers only a few aspects of the reference answer, with significant omissions."
            },
            {
                "__type__": "criteria_option",
                "name": "Very Bad",
                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted."
            }
        ],
        "option_map": {
            "Excellent": 1.0,
            "Good": 0.75,
            "mediocre": 0.5,
            "Bad": 0.25,
            "Very Bad": 0
        }
    },
    "context_fields": {
        "question": "question",
        "reference_answers": "reference_answers"
    },
    "criteria_field": "criteria",
    "generate_summaries": false,
    "check_positional_bias": false
}